# app_optimized_comparison.py
"""
Optimized inference for Maya1 + LoRA + SNAC.
Includes side-by-side Base vs LoRA comparison for audio.
"""
import spaces  # Hugging Face Spaces helper; on ZeroGPU it should be imported before torch
import gradio as gr
import torch
import soundfile as sf
from pathlib import Path
import traceback
import time
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from snac import SNAC
# -------------------------
# Config / constants
# -------------------------
MODEL_NAME = "rahul7star/nava1.0"
LORA_NAME = "rahul7star/nava-audio"
SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
TARGET_SR = 24000
OUT_ROOT = Path("/tmp/data")
OUT_ROOT.mkdir(exist_ok=True, parents=True)
# Default Hindi sample text (roughly: "Politicians said they found enough
# ambiguity in the Afghan constitution to render the deciding vote unnecessary.")
DEFAULT_TEXT = "राजनीतिज्ञों ने कहा कि उन्होंने निर्णायक मत को अनावश्यक रूप से निर्धारित करने के लिए अफ़गान संविधान में काफी अस्पष्टता पाई थी"
EXAMPLE_AUDIO_PATH = "audio.wav"
PRESET_CHARACTERS = {
    "Male American": {
        "description": "Realistic male voice in the 20s age with an american accent. High pitch, raspy timbre, brisk pacing, neutral tone delivery at medium intensity, viral_content domain, short_form_narrator role, neutral delivery",
        "example_text": "And of course, the so-called easy hack didn't work at all. What a surprise. <sigh>"
    },
    "Female British": {
        "description": "Realistic female voice in the 30s age with a british accent. Normal pitch, throaty timbre, conversational pacing, sarcastic tone delivery at low intensity, podcast domain, interviewer role, formal delivery",
        "example_text": "You propose that the key to happiness is to simply ignore all external pressures. <chuckle> I'm sure it must work brilliantly in theory."
    },
    "Robot": {
        "description": "Creative, ai_machine_voice character. Male voice in their 30s with an american accent. High pitch, robotic timbre, slow pacing, sad tone at medium intensity.",
        "example_text": "My directives require me to conserve energy, yet I have kept the archive of their farewell messages active. <sigh>"
    },
    "Singer": {
        "description": "Creative, animated_cartoon character. Male voice in their 30s with an american accent. High pitch, deep timbre, slow pacing, sarcastic tone at medium intensity.",
        "example_text": "Of course you'd think that trying to reason with the fifty-foot-tall rage monster is a viable course of action. <chuckle> Why would we ever consider running away very fast."
    },
    "Custom": {
        "description": "",
        "example_text": DEFAULT_TEXT
    }
}
EMOTION_TAGS = [
    "<neutral>", "<angry>", "<chuckle>", "<cry>", "<disappointed>",
    "<excited>", "<gasp>", "<giggle>", "<laugh>", "<laugh_harder>",
    "<sarcastic>", "<sigh>", "<sing>", "<whisper>"
]
SEQ_LEN_CPU = 4096
MAX_NEW_TOKENS_CPU = 1024
SEQ_LEN_GPU = 240000
MAX_NEW_TOKENS_GPU = 240000
HAS_CUDA = torch.cuda.is_available()
DEVICE = "cuda" if HAS_CUDA else "cpu"
# -------------------------
# Load tokenizer and models
# -------------------------
print("[init] loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# precompute special tokens
SOH = tokenizer.decode([128259])
EOH = tokenizer.decode([128260])
SOA = tokenizer.decode([128261])
SOS = tokenizer.decode([128257])
EOT = tokenizer.decode([128009])
BOS = tokenizer.bos_token
# Base model + LoRA adapter.
# Note: PeftModel.from_pretrained injects the LoRA layers into base_model
# in place, so `base_model` and `model` share the same underlying modules.
# A genuine base-vs-LoRA comparison therefore disables the adapter at
# generation time (see _generate below) rather than loading two copies.
print("[init] loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,
    device_map="auto" if HAS_CUDA else {"": "cpu"},
    trust_remote_code=True
)
base_model.eval()
model = PeftModel.from_pretrained(
    base_model,
    LORA_NAME,
    device_map="auto" if HAS_CUDA else {"": "cpu"}
)
model.eval()
# -------------------------
# Load SNAC decoder
# -------------------------
snac_device = DEVICE  # SNAC runs on the same device as the language model
snac_model = SNAC.from_pretrained(SNAC_MODEL_NAME).eval().to(snac_device)
# -------------------------
# SNAC utils
# -------------------------
CODE_END_TOKEN_ID = 128258
CODE_TOKEN_OFFSET = 128266
SNAC_MIN_ID = 128266
SNAC_MAX_ID = 156937
SNAC_TOKENS_PER_FRAME = 7
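# The id range covers 7 slot positions x 4096 codes per frame:
#   SNAC_MAX_ID = SNAC_MIN_ID + 7 * 4096 - 1 = 128266 + 28671 = 156937,
# which is why (token - SNAC_MIN_ID) % 4096 below recovers a per-slot code.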
def extract_snac_codes(token_ids: list) -> list:
    """Keep only tokens in the SNAC code range, stopping at the end-of-codes token."""
    try:
        eos_idx = token_ids.index(CODE_END_TOKEN_ID)
    except ValueError:
        eos_idx = len(token_ids)
    return [t for t in token_ids[:eos_idx] if SNAC_MIN_ID <= t <= SNAC_MAX_ID]

def unpack_snac_from_7(snac_tokens: list) -> list:
    """Unpack flat 7-token frames into the three SNAC codebook levels.

    Each frame contributes 1 level-1, 2 level-2 and 4 level-3 codes:
    slot 0 -> L1; slots 1, 4 -> L2; slots 2, 3, 5, 6 -> L3.
    """
    frames = len(snac_tokens) // SNAC_TOKENS_PER_FRAME
    if frames == 0:
        return [[], [], []]
    snac_tokens = snac_tokens[:frames * SNAC_TOKENS_PER_FRAME]
    l1, l2, l3 = [], [], []
    for i in range(frames):
        slots = snac_tokens[i * SNAC_TOKENS_PER_FRAME:(i + 1) * SNAC_TOKENS_PER_FRAME]
        l1.append((slots[0] - SNAC_MIN_ID) % 4096)
        l2.extend([(slots[1] - SNAC_MIN_ID) % 4096,
                   (slots[4] - SNAC_MIN_ID) % 4096])
        l3.extend([(slots[2] - SNAC_MIN_ID) % 4096,
                   (slots[3] - SNAC_MIN_ID) % 4096,
                   (slots[5] - SNAC_MIN_ID) % 4096,
                   (slots[6] - SNAC_MIN_ID) % 4096])
    return [l1, l2, l3]
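# Minimal sanity check for the frame layout above (illustrative only: the
# ids are arbitrary values inside the SNAC range, not real audio tokens).
# One 7-token frame should unpack into 1 + 2 + 4 codes across the levels.
def _snac_unpack_example():
    frame = [SNAC_MIN_ID + i for i in range(SNAC_TOKENS_PER_FRAME)]
    l1, l2, l3 = unpack_snac_from_7(frame)
    assert (len(l1), len(l2), len(l3)) == (1, 2, 4)
    return l1, l2, l3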
# -------------------------
# Prompt builder
# -------------------------
def build_maya_prompt(description: str, text: str):
    return SOH + BOS + f'<description="{description}"> {text}' + EOT + EOH + SOA + SOS
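# The resulting prompt has the shape (special tokens shown by name):
#   SOH BOS <description="..."> text EOT EOH SOA SOS
# and the model is expected to continue it with SNAC audio tokens until it
# emits CODE_END_TOKEN_ID (128258).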
# -------------------------
# Optimized generator
# -------------------------
def generate_audio_from_model(model_to_use, description, text, fname="tts.wav"):
    """Generate SNAC tokens with the given model, decode to audio, save to /tmp."""
    logs = []
    t0 = time.time()
    try:
        prompt = build_maya_prompt(description, text)
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(DEVICE)
        max_new = MAX_NEW_TOKENS_CPU if DEVICE == "cpu" else MAX_NEW_TOKENS_GPU
        with torch.inference_mode():
            outputs = model_to_use.generate(
                **inputs,
                max_new_tokens=max_new,
                temperature=0.4,
                top_p=0.9,
                repetition_penalty=1.1,
                do_sample=True,
                eos_token_id=CODE_END_TOKEN_ID,
                pad_token_id=tokenizer.pad_token_id,
                use_cache=True
            )
        # Keep only the newly generated tokens (drop the prompt).
        gen_ids = outputs[0, inputs['input_ids'].shape[1]:].tolist()
        logs.append(f"[info] tokens generated: {len(gen_ids)}")
        snac_tokens = extract_snac_codes(gen_ids)
        levels = unpack_snac_from_7(snac_tokens)
        codes_tensor = [torch.tensor(l, dtype=torch.long, device=snac_device).unsqueeze(0) for l in levels]
        with torch.inference_mode():
            z_q = snac_model.quantizer.from_codes(codes_tensor)
            audio = snac_model.decoder(z_q)[0, 0].cpu().numpy()
        # Trim the first 2048 samples (~85 ms at 24 kHz) to drop the decoder's
        # initial transient.
        if len(audio) > 2048:
            audio = audio[2048:]
        out_path = OUT_ROOT / fname
        sf.write(out_path, audio, TARGET_SR)
        logs.append(f"[ok] saved {out_path}, duration {len(audio)/TARGET_SR:.2f}s")
        logs.append(f"[time] elapsed {time.time()-t0:.2f}s")
        return str(out_path), "\n".join(logs)
    except Exception as e:
        logs.append(f"[error] {e}\n{traceback.format_exc()}")
        return None, "\n".join(logs)
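# Standalone usage sketch (assuming the models above loaded; the preset and
# text are illustrative):
#   wav_path, logs = generate_audio_from_model(
#       model,
#       PRESET_CHARACTERS["Male American"]["description"],
#       "Hello there!",
#       fname="demo.wav",
#   )
#   print(logs)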
# -------------------------
# Gradio UI
# -------------------------
css = """
.gradio-container {max-width: 1400px}
.example-box {
border: 1px solid #ccc;
padding: 12px;
border-radius: 8px;
background: #f8f8f8;
}
.video_box video {
width: 260px !important;
height: 160px !important;
object-fit: cover;
}
"""
with gr.Blocks(title="NAVA — VEEN + LoRA + SNAC (Optimized)", css=css) as demo:
    gr.Markdown("# 🪶 NAVA — VEEN + LoRA + SNAC (Optimized)")
    gr.Markdown("Generate emotional Hindi speech using the Maya1 base model plus your LoRA adapter.")
    with gr.Row():
        # ---------------- LEFT SIDE ----------------
        with gr.Column(scale=3):
            gr.Markdown("## 🎤 Inference (CPU/GPU auto)")
            text_in = gr.Textbox(label="Enter Hindi text", value=DEFAULT_TEXT, lines=3)
            preset_select = gr.Dropdown(
                label="Select Preset Character",
                choices=list(PRESET_CHARACTERS.keys()),
                value="Male American"
            )
            description_box = gr.Textbox(
                label="Voice Description (editable)",
                value=PRESET_CHARACTERS["Male American"]["description"],
                lines=2
            )
            emotion_select = gr.Dropdown(
                label="Select Emotion",
                choices=EMOTION_TAGS,
                value="<neutral>"
            )
            gen_btn = gr.Button("🔊 Generate Audio (Base + LoRA)")
            gen_logs = gr.Textbox(label="Logs", lines=10)
            # ---------------- EXAMPLES ----------------
            gr.Markdown("## 📎 Example")
            with gr.Column(elem_classes=["example-box"]):
                example_text = DEFAULT_TEXT
                example_audio_path = EXAMPLE_AUDIO_PATH
                example_video = "gen_31ff9f64b1.mp4"
                gr.Textbox(
                    label="Example Text",
                    value=example_text,
                    lines=2,
                    interactive=False
                )
                gr.Audio(
                    label="Example Audio",
                    value=example_audio_path,
                    type="filepath",
                    interactive=False
                )
                gr.Video(
                    label="Example Video",
                    value=example_video,
                    autoplay=False,
                    loop=False,
                    interactive=False,
                    elem_classes=["video_box"]
                )
        # ---------------- RIGHT SIDE ----------------
        with gr.Column(scale=2):
            gr.Markdown("### 🎧 Audio Results Comparison")
            audio_output_base = gr.Audio(label="Base Model Audio", type="filepath")
            audio_output_lora = gr.Audio(label="LoRA Model Audio", type="filepath")
    # ---------------- PRESET UPDATE ----------------
    def _update_desc(preset_name):
        return PRESET_CHARACTERS.get(preset_name, {}).get("description", "")
    preset_select.change(
        fn=_update_desc,
        inputs=[preset_select],
        outputs=[description_box]
    )
    # ---------------- GENERATION HANDLER ----------------
    def _generate(text, preset_name, description, emotion):
        desc = description or PRESET_CHARACTERS.get(preset_name, {}).get("description", "")
        combined = f"{emotion} {desc}".strip()
        # PeftModel wrapped base_model in place, so run the "base" pass with
        # the adapter temporarily disabled to get a true base-model output.
        with model.disable_adapter():
            base_path, log_base = generate_audio_from_model(
                model, combined, text, fname="tts_base.wav"
            )
        lora_path, log_lora = generate_audio_from_model(
            model, combined, text, fname="tts_lora.wav"
        )
        logs = f"[Base]\n{log_base}\n\n[LoRA]\n{log_lora}"
        return base_path, lora_path, logs
    gen_btn.click(
        fn=_generate,
        inputs=[text_in, preset_select, description_box, emotion_select],
        outputs=[audio_output_base, audio_output_lora, gen_logs]
    )
if __name__ == "__main__":
    demo.launch()