# app_optimized_comparison.py
"""
Optimized inference for Maya1 + LoRA + SNAC.
Includes a side-by-side base-vs-LoRA audio comparison.
"""
import spaces  # imported early for Hugging Face Spaces (ZeroGPU) integration

import gradio as gr
import torch
import soundfile as sf
from pathlib import Path
import traceback
import time

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from snac import SNAC

# -------------------------
# Config / constants
# -------------------------
MODEL_NAME = "rahul7star/nava1.0"
LORA_NAME = "rahul7star/nava-audio"
SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
TARGET_SR = 24000
OUT_ROOT = Path("/tmp/data")
OUT_ROOT.mkdir(exist_ok=True, parents=True)

# Default sample text in Hindi (roughly: "Politicians said they had found enough
# ambiguity in the Afghan constitution to deem the deciding vote unnecessary").
DEFAULT_TEXT = "राजनीतिज्ञों ने कहा कि उन्होंने निर्णायक मत को अनावश्यक रूप से निर्धारित करने के लिए अफ़गान संविधान में काफी अस्पष्टता पाई थी"
EXAMPLE_AUDIO_PATH = "audio.wav"

PRESET_CHARACTERS = {
    "Male American": {
        "description": "Realistic male voice in the 20s age with an american accent. High pitch, raspy timbre, brisk pacing, neutral tone delivery at medium intensity, viral_content domain, short_form_narrator role, neutral delivery",
        "example_text": "And of course, the so-called easy hack didn't work at all. What a surprise. <sigh>"
    },
    "Female British": {
        "description": "Realistic female voice in the 30s age with a british accent. Normal pitch, throaty timbre, conversational pacing, sarcastic tone delivery at low intensity, podcast domain, interviewer role, formal delivery",
        "example_text": "You propose that the key to happiness is to simply ignore all external pressures. <chuckle> I'm sure it must work brilliantly in theory."
    },
    "Robot": {
        "description": "Creative, ai_machine_voice character. Male voice in their 30s with an american accent. High pitch, robotic timbre, slow pacing, sad tone at medium intensity.",
        "example_text": "My directives require me to conserve energy, yet I have kept the archive of their farewell messages active. <sigh>"
    },
    "Singer": {
        "description": "Creative, animated_cartoon character. Male voice in their 30s with an american accent. High pitch, deep timbre, slow pacing, sarcastic tone at medium intensity.",
        "example_text": "Of course you'd think that trying to reason with the fifty-foot-tall rage monster is a viable course of action. <chuckle> Why would we ever consider running away very fast."
    },
    "Custom": {
        "description": "",
        "example_text": DEFAULT_TEXT
    }
}

EMOTION_TAGS = [
    "<neutral>", "<angry>", "<chuckle>", "<cry>", "<disappointed>",
    "<excited>", "<gasp>", "<giggle>", "<laugh>", "<laugh_harder>",
    "<sarcastic>", "<sigh>", "<sing>", "<whisper>"
]

# Generation budgets. The SEQ_LEN_* values are currently unused; MAX_NEW_TOKENS_GPU
# is set high enough to be effectively uncapped on GPU.
SEQ_LEN_CPU = 4096
MAX_NEW_TOKENS_CPU = 1024
SEQ_LEN_GPU = 240000
MAX_NEW_TOKENS_GPU = 240000

HAS_CUDA = torch.cuda.is_available()
DEVICE = "cuda" if HAS_CUDA else "cpu"

# -------------------------
# Load tokenizer and models
# -------------------------
print("[init] loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Precompute the special tokens used by the Maya1 prompt format
SOH = tokenizer.decode([128259])  # start of human turn
EOH = tokenizer.decode([128260])  # end of human turn
SOA = tokenizer.decode([128261])  # start of AI turn
SOS = tokenizer.decode([128257])  # start of speech (audio codes follow)
EOT = tokenizer.decode([128009])  # end of text
BOS = tokenizer.bos_token

# Base model + LoRA model. Note: PeftModel.from_pretrained injects the adapter into
# base_model's modules in place, so `base_model` alone is no longer a clean baseline
# after wrapping; the generation handler below disables the adapter for the "base" pass.
print("[init] loading base model (CPU/GPU)...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,
    device_map={"": "cpu"} if not HAS_CUDA else "auto",
    trust_remote_code=True
)
base_model.eval()

model = PeftModel.from_pretrained(base_model, LORA_NAME, device_map={"": "cpu"} if not HAS_CUDA else "auto")
model.eval()

# -------------------------
# Load SNAC decoder
# -------------------------
snac_device = DEVICE  # already "cpu" when CUDA is unavailable
snac_model = SNAC.from_pretrained(SNAC_MODEL_NAME).eval().to(snac_device)

# -------------------------
# SNAC utils
# -------------------------
CODE_END_TOKEN_ID = 128258      # end-of-speech token terminating the code stream
CODE_TOKEN_OFFSET = 128266      # first SNAC code token ID (same as SNAC_MIN_ID)
SNAC_MIN_ID = 128266
SNAC_MAX_ID = 156937            # 156937 - 128266 + 1 = 28672 = 7 * 4096 IDs
SNAC_TOKENS_PER_FRAME = 7       # 1 + 2 + 4 codes per frame across the 3 SNAC levels

def extract_snac_codes(token_ids: list) -> list:
    """Keep only SNAC code tokens, stopping at the first end-of-speech token."""
    try:
        eos_idx = token_ids.index(CODE_END_TOKEN_ID)
    except ValueError:
        eos_idx = len(token_ids)
    return [t for t in token_ids[:eos_idx] if SNAC_MIN_ID <= t <= SNAC_MAX_ID]
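
# Example with hypothetical IDs: extract_snac_codes([128266, 128300, 128258, 128266])
# returns [128266, 128300] -- tokens after the first CODE_END_TOKEN_ID are dropped,
# as is anything outside the SNAC code range.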

def unpack_snac_from_7(snac_tokens: list) -> list:
    """Split the flat 7-token frames into the 3 SNAC codebook levels."""
    frames = len(snac_tokens) // SNAC_TOKENS_PER_FRAME
    snac_tokens = snac_tokens[:frames * SNAC_TOKENS_PER_FRAME]  # drop any partial frame
    if frames == 0:
        return [[], [], []]
    l1, l2, l3 = [], [], []
    for i in range(frames):
        slots = snac_tokens[i*7:(i+1)*7]
        l1.append((slots[0]-SNAC_MIN_ID)%4096)
        l2.extend([(slots[1]-SNAC_MIN_ID)%4096, (slots[4]-SNAC_MIN_ID)%4096])
        l3.extend([(slots[2]-SNAC_MIN_ID)%4096, (slots[3]-SNAC_MIN_ID)%4096, (slots[5]-SNAC_MIN_ID)%4096, (slots[6]-SNAC_MIN_ID)%4096])
    return [l1, l2, l3]
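
# Worked example with hypothetical per-slot offsets from SNAC_MIN_ID of
# (5, 9, 1, 2, 7, 3, 4) for a single frame: slot 0 feeds level 1, slots 1 and 4 feed
# level 2, and slots 2, 3, 5, 6 feed level 3, giving [[5], [9, 7], [1, 2, 3, 4]].
# The % 4096 maps each slot's ID band back to a codebook index (7 slots x 4096 IDs
# exactly covers the SNAC_MIN_ID..SNAC_MAX_ID range).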

# -------------------------
# Prompt builder
# -------------------------
def build_maya_prompt(description: str, text: str):
    return SOH + BOS + f'<description="{description}"> {text}' + EOT + EOH + SOA + SOS
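
# For description d and text t, the assembled prompt is:
#   {SOH}{BOS}<description="d"> t{EOT}{EOH}{SOA}{SOS}
# i.e. one human turn carrying the voice description and the text, followed by the
# start-of-AI and start-of-speech markers that cue the model to emit audio codes.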

# -------------------------
# Optimized generator
# -------------------------

def generate_audio_from_model(model_to_use, description, text, fname="tts.wav"):
    logs = []
    t0 = time.time()
    try:
        prompt = build_maya_prompt(description, text)
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(DEVICE)

        max_new = MAX_NEW_TOKENS_CPU if DEVICE == "cpu" else MAX_NEW_TOKENS_GPU
        with torch.inference_mode():
            outputs = model_to_use.generate(
                **inputs,
                max_new_tokens=max_new,
                temperature=0.4,
                top_p=0.9,
                repetition_penalty=1.1,
                do_sample=True,
                eos_token_id=CODE_END_TOKEN_ID,
                pad_token_id=tokenizer.pad_token_id,
                use_cache=True
            )

        gen_ids = outputs[0, inputs['input_ids'].shape[1]:].tolist()
        logs.append(f"[info] tokens generated: {len(gen_ids)}")

        snac_tokens = extract_snac_codes(gen_ids)
        if len(snac_tokens) < SNAC_TOKENS_PER_FRAME:
            raise ValueError("no decodable SNAC frames in the generated tokens")
        levels = unpack_snac_from_7(snac_tokens)
        codes_tensor = [torch.tensor(l, dtype=torch.long, device=snac_device).unsqueeze(0) for l in levels]

        with torch.inference_mode():
            z_q = snac_model.quantizer.from_codes(codes_tensor)
            audio = snac_model.decoder(z_q)[0,0].cpu().numpy()

        if len(audio) > 2048:
            audio = audio[2048:]  # trim ~85 ms of decoder warm-up transient

        out_path = OUT_ROOT / fname
        sf.write(out_path, audio, TARGET_SR)
        logs.append(f"[ok] saved {out_path}, duration {len(audio)/TARGET_SR:.2f}s")
        logs.append(f"[time] elapsed {time.time()-t0:.2f}s")
        return str(out_path), "\n".join(logs)
    except Exception as e:
        logs.append(f"[error] {e}\n{traceback.format_exc()}")
        return None, "\n".join(logs)
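
# Standalone usage outside the Gradio UI (e.g. from a Python REPL); "Robot" is just
# one of the preset keys defined above:
#   wav_path, log = generate_audio_from_model(
#       model, PRESET_CHARACTERS["Robot"]["description"], "Hello there.", fname="demo.wav")
#   print(log)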

# -------------------------
# Gradio UI
# -------------------------
css = """
.gradio-container {max-width: 1400px}
.example-box {
    border: 1px solid #ccc;
    padding: 12px;
    border-radius: 8px;
    background: #f8f8f8;
}
.video_box video {
    width: 260px !important;
    height: 160px !important;
    object-fit: cover;
}
"""

with gr.Blocks(title="NAVA — VEEN + LoRA + SNAC (Optimized)", css=css) as demo:

    gr.Markdown("# 🪶 NAVA — VEEN + LoRA + SNAC (Optimized)")
    gr.Markdown("Generate emotional Hindi speech using Maya1 base + your LoRA adapter.")

    with gr.Row():
        # ---------------- LEFT SIDE ----------------
        with gr.Column(scale=3):
            gr.Markdown("## 🎤 Inference (CPU/GPU auto)")

            text_in = gr.Textbox(label="Enter Hindi text", value=DEFAULT_TEXT, lines=3)
            preset_select = gr.Dropdown(
                label="Select Preset Character",
                choices=list(PRESET_CHARACTERS.keys()),
                value="Male American"
            )
            description_box = gr.Textbox(
                label="Voice Description (editable)",
                value=PRESET_CHARACTERS["Male American"]["description"],
                lines=2
            )
            emotion_select = gr.Dropdown(
                label="Select Emotion",
                choices=EMOTION_TAGS,
                value="<neutral>"
            )
            gen_btn = gr.Button("🔊 Generate Audio (Base + LoRA)")
            gen_logs = gr.Textbox(label="Logs", lines=10)

            # ---------------- EXAMPLES ----------------
            gr.Markdown("## 📎 Example")

            with gr.Column(elem_classes=["example-box"]):
                example_text = DEFAULT_TEXT
                example_audio_path = EXAMPLE_AUDIO_PATH  # reuse the module-level constant
                example_video = "gen_31ff9f64b1.mp4"

                gr.Textbox(
                    label="Example Text",
                    value=example_text,
                    lines=2,
                    interactive=False
                )
                gr.Audio(
                    label="Example Audio",
                    value=example_audio_path,
                    type="filepath",
                    interactive=False
                )
                gr.Video(
                    label="Example Video",
                    value=example_video,
                    autoplay=False,
                    loop=False,
                    interactive=False,
                    elem_classes=["video_box"]
                )

        # ---------------- RIGHT SIDE ----------------
        with gr.Column(scale=2):
            gr.Markdown("### 🎧 Audio Results Comparison")
            audio_output_base = gr.Audio(label="Base Model Audio", type="filepath")
            audio_output_lora = gr.Audio(label="LoRA Model Audio", type="filepath")

    # ---------------- PRESET UPDATE ----------------
    def _update_desc(preset_name):
        return PRESET_CHARACTERS.get(preset_name, {}).get("description", "")

    preset_select.change(
        fn=_update_desc,
        inputs=[preset_select],
        outputs=[description_box]
    )

    # ---------------- GENERATION HANDLER ----------------
    def _generate(text, preset_name, description, emotion):
        desc = description or PRESET_CHARACTERS.get(preset_name, {}).get("description", "")
        combined = f"{emotion} {desc}".strip()

        # PeftModel wraps the base model in place, so temporarily disable the
        # adapter to get a true base-model pass for the comparison.
        with model.disable_adapter():
            base_path, log_base = generate_audio_from_model(
                model, combined, text, fname="tts_base.wav"
            )
        lora_path, log_lora = generate_audio_from_model(
            model, combined, text, fname="tts_lora.wav"
        )

        logs = f"[Base]\n{log_base}\n\n[LoRA]\n{log_lora}"
        return base_path, lora_path, logs

    gen_btn.click(
        fn=_generate,
        inputs=[text_in, preset_select, description_box, emotion_select],
        outputs=[audio_output_base, audio_output_lora, gen_logs]
    )

if __name__ == "__main__":
    demo.launch()