snorTTS-Indic-v0
snorTTS-Indic-v0 is a multilingual Indic Text-to-Speech (TTS) model capable of generating speech in nine Indic languages: Hindi, Tamil, Telugu, Marathi, Kannada, Malayalam, Punjabi, Gujarati, and Bengali.
👉 Read the full blog: Train a SoTA Multilingual Indic Text-to-Speech (TTS) to learn how we built it.
👉 Try out the model in our playground.
All code, datasets, and models used in this work, both base and fine-tuned, are available below for anyone to use and build upon.
Capabilities
- TTS
- Voice cloning
- Code-switching
- Cross-lingual voice cloning (multilingual voice transfer)
Model Overview
Item | Details |
---|---|
Architecture | LLaMA-3.2-3B |
Base model | canopylabs/3b-hi-pretrain-research_release |
Audio codec | SNAC @ 24 kHz, 3 codebooks (12,288 new tokens) |
Languages | Hindi, Gujarati, Marathi, Punjabi, Bengali, Telugu, Kannada, Malayalam, Tamil |
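Each SNAC frame is flattened into seven tokens (1 coarse, 2 medium, and 4 fine codes), and the inference decoder below shifts each of the seven positions into its own 4,096-wide id range above the first audio token id. A minimal illustrative sketch of the encoding direction; frame_to_token_ids is our own name, not part of the release:

# Illustrative only: how one SNAC frame is flattened into 7 token ids.
AUDIO_START_ID = 128256 + 10  # first audio token id (see the inference script below)
CODEBOOK_SIZE = 4096

def frame_to_token_ids(c0, c1a, c1b, c2a, c2b, c2c, c2d):
    # Interleaving order mirrors the decoder in generate_audio below:
    # [coarse, medium_1, fine_1, fine_2, medium_2, fine_3, fine_4]
    flat = [c0, c1a, c2a, c2b, c1b, c2c, c2d]
    return [AUDIO_START_ID + pos * CODEBOOK_SIZE + code for pos, code in enumerate(flat)]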
Training
For details about the training and dataset, please refer to Train a SoTA Multilingual Indic Text-to-Speech (TTS).
You can find the training script (train_orepheus.py) in this repository. It is a single, self-contained script for fine-tuning the base model.
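Since the script is self-contained, a typical launch (assuming its defaults and a CUDA GPU; check the script itself for any required arguments) would be:

python train_orepheus.py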
👉 Dataset used for training: snorbyte/indic-tts-sample-snac-encoded
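To take a quick look at the training data, the dataset can be loaded with the datasets library. A minimal sketch, assuming the default train split:

from datasets import load_dataset

# Inspect one row of the SNAC-encoded training sample.
ds = load_dataset("snorbyte/indic-tts-sample-snac-encoded", split="train")
print(ds[0])  # text, speaker metadata, and SNAC codes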
Inference
- Install the necessary system libraries for Linux
sudo apt update
sudo apt install -y sox libsox-dev
- Use Python 3.10
- If you already have torch installed, uninstall it and let unsloth install the version it needs.
pip uninstall -y torch torchaudio
- Install necessary packages
pip install unsloth loguru snac deepfilternet pydub soundfile librosa torchaudio
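Before running the full example, a quick sanity check that the environment is wired up (a sketch; the inference code below assumes a CUDA GPU and uses torchaudio's sox effects):

import torch
import torchaudio

print(torch.cuda.is_available())  # must be True: generation runs on "cuda"
print(torchaudio.__version__)  # sox_effects is used later for the tempo change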
from unsloth import FastLanguageModel
from snac import SNAC
import soundfile as sf
import numpy as np
from loguru import logger
from df.enhance import init_df, enhance, save_audio
import torch
import librosa
import torchaudio
import os
# Name of the model
MODEL_NAME = 'snorbyte/snorTTS-Indic-v0'
MAX_SEQ_LENGTH = 4096
HUGGINGFACE_TOKEN = ""  # ! Add your Hugging Face token
# Load the model and tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    # load_in_4bit=True,  # optional: load in 4-bit to save VRAM
    max_seq_length=MAX_SEQ_LENGTH,
    token=HUGGINGFACE_TOKEN,
)
logger.success(f"Loaded model: {MODEL_NAME}")
# Define the special token ids used by the model.
tokeniser_length = 128256
end_of_speech_id = tokeniser_length + 2
pad_token_id = tokeniser_length + 7
audio_start_id = tokeniser_length + 10
pad_token = tokenizer.decode([pad_token_id])
logger.success("Loaded special tokens for the tokenizer.")
# Wrap Model for Inference
FastLanguageModel.for_inference(model)
logger.success(f"{MODEL_NAME} is ready for inference.")
# Set the padding token and padding side.
tokenizer.pad_token = pad_token
tokenizer.padding_side = "left"
logger.success("Set padding token and padding side for the tokenizer.")
# Load the SNAC model for audio decoding.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
logger.success("Loaded SNAC model for audio decoding.")
# Load DeepFilterNet for optional post-processing (denoising).
df_model, df_state, _ = init_df()
# Function to generate audio for a prompt row.
def generate_audio(
    row, model, tokenizer, user=False, temperature=0.4, top_p=0.9, repetition_penalty=1.05
):
    try:
        # Pick the speaker-specific or the standard prompt.
        if user:
            prompt = row["eval_text_user"]
        else:
            prompt = row["eval_text_no_user"]
        inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
        max_tokens = MAX_SEQ_LENGTH - inputs.input_ids.shape[1]
        output = model.generate(
            input_ids=inputs.input_ids.to("cuda"),
            attention_mask=inputs.attention_mask.to("cuda"),
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            eos_token_id=end_of_speech_id,
        )
        # Keep only audio tokens (ids at or above audio_start_id).
        audio_ids = []
        for token_id in output[0]:
            if token_id >= audio_start_id:
                audio_ids.append(token_id.item())
        # Re-base the ids to start at 0, dropping any trailing incomplete 7-token frame.
        clean_audio_ids = []
        for i in range(len(audio_ids) // 7):
            for j in range(7):
                clean_audio_ids += [audio_ids[7 * i + j] - audio_start_id]
        # Un-flatten each 7-token frame into the three SNAC codebooks:
        # 1 coarse, 2 medium, and 4 fine codes per frame, each position
        # offset by a further multiple of 4096.
        codes = [[], [], []]
        for i in range(len(clean_audio_ids) // 7):
            codes[0].append(clean_audio_ids[7 * i])
            codes[1].append(clean_audio_ids[7 * i + 1] - 4096)
            codes[2].append(clean_audio_ids[7 * i + 2] - (2 * 4096))
            codes[2].append(clean_audio_ids[7 * i + 3] - (3 * 4096))
            codes[1].append(clean_audio_ids[7 * i + 4] - (4 * 4096))
            codes[2].append(clean_audio_ids[7 * i + 5] - (5 * 4096))
            codes[2].append(clean_audio_ids[7 * i + 6] - (6 * 4096))
        codes = [
            torch.tensor(codes[0]).unsqueeze(0),
            torch.tensor(codes[1]).unsqueeze(0),
            torch.tensor(codes[2]).unsqueeze(0),
        ]
        try:
            audio = snac_model.decode(codes)
        except Exception as e:
            logger.error(f"Error decoding audio: {e}")
            return None
        return audio.detach().squeeze().to("cpu").numpy()
    except Exception as e:
        logger.error(f"Error generating audio: {e}")
        return None
# Run inference.
# * Please refer to the training script to see how prompts are built from SNAC tokens.
row = {
    "eval_text_user": f"<custom_token_3><|begin_of_text|>kannada142: ಅಯ್ಯಯ್ಯೋ... Whitefield ಗೆ reach ಆಗೋಕೆ almost 10 hours ಆಯ್ತು you know... traffic was so terrible today <|eot_id|><custom_token_4><custom_token_5><custom_token_1>"
}
eval_sample = generate_audio(row, model, tokenizer, True)
if eval_sample is None:
    logger.error("Failed to generate audio for evaluation sample.")
else:
    logger.success("Audio generated. Starting post-processing.")
    ## Post-processing settings
    filename = "eval.wav"
    speed = 1.05  # speed up playback; use the recommended value for the speaker
    denoise = False  # set True to denoise with DeepFilterNet
    output = eval_sample.astype(np.float32)
    # Speed up
    if abs(speed - 1.0) > 1e-4:
        output_t = torch.from_numpy(output).unsqueeze(0)
        output_speed, _ = torchaudio.sox_effects.apply_effects_tensor(
            output_t, 24_000, effects=[["tempo", f"{speed}"]]
        )
        output = output_speed.squeeze(0).cpu().numpy()
    # Denoise
    if denoise:
        resampled_48k = librosa.resample(output, orig_sr=24_000, target_sr=48_000)
        resampled_48k = torch.from_numpy(resampled_48k).unsqueeze(0)
        output_48k = enhance(df_model, df_state, resampled_48k)
        output_48k = output_48k.squeeze(0).cpu().numpy()
        output = librosa.resample(output_48k, orig_sr=48_000, target_sr=24_000)
    logger.success("Saving final output...")
    # Save
    sf.write(filename, output, 24_000)
    logger.success(f"Generated and saved evaluation sample audio as {filename}.")
Prompts
- Standard
{
    "eval_text_no_user": f"<custom_token_3><|begin_of_text|>{utterance}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>"
}
{
    "eval_text_no_user": f"<custom_token_3><|begin_of_text|>நிச்சயமா. ரோம் ல் இரவு நேரம் ரொம்ப அழகா இருக்கு—piazzaகள் சுத்துறதுக்கு நல்ல நேரம்.<|eot_id|><custom_token_4><custom_token_5><custom_token_1>"
}
- Speaker-specific (recommended)
{
    "eval_text_user": f"<custom_token_3><|begin_of_text|>{language}{speaker_id}: {utterance}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>"
}
📝 utterance can be in the speaker's native language, multilingual, or code-switched.
{
    "eval_text_user": f"<custom_token_3><|begin_of_text|>hindi159: चलते रहो इस सफर में बिना रुके, क्योंकि मंज़िलें खुद राह दिखाने लगती हैं <|eot_id|><custom_token_4><custom_token_5><custom_token_1>"
}
{
    "eval_text_user": f"<custom_token_3><|begin_of_text|>bengali125: मुझे तो लगा वो आएगा, ஆனா அவன் வந்து full drama பண்ணிட்டான், আর শেষে আবার আমাকে দোষ দিচ্ছে <|eot_id|><custom_token_4><custom_token_5><custom_token_1>"
}
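If you generate many utterances, the two templates above can be wrapped in a small helper; build_prompt is our own convenience function, not part of the release:

def build_prompt(utterance, language=None, speaker_id=None):
    # Speaker-specific prompt when language and speaker_id are given, standard otherwise.
    prefix = f"{language}{speaker_id}: " if language and speaker_id is not None else ""
    return (
        "<custom_token_3><|begin_of_text|>"
        f"{prefix}{utterance}"
        "<|eot_id|><custom_token_4><custom_token_5><custom_token_1>"
    )

row = {"eval_text_user": build_prompt("चलते रहो इस सफर में बिना रुके", "hindi", 159)}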
Speaker IDs
Language | Speakers | Recommended Speedup |
---|---|---|
Hindi | [159,49,43] | [1.05,1.1,1.1] |
Tamil | [188,128,176] | [1.1,1.15,1.1] |
Bengali | [125] | [1.1] |
Malayalam | [189,124] | [1.1,1.1] |
Kannada | [142,138,131,59] | [1.05,1.1,1.1,1.1] |
Telugu | [69,133] | [1.1,1.1] |
Punjabi | [191,67,201] | [1.08,1.06,1.1] |
Gujarati | [62,190] | [1.15,1.25] |
Marathi | [205,82,199,203] | [1.05,1.05,1.1,1.15] |
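The recommended speedups can also be kept as a lookup so the post-processing tempo matches the chosen speaker; a sketch that simply restates the table above:

# Recommended tempo per (language, speaker_id), transcribed from the table above.
RECOMMENDED_SPEED = {
    ("hindi", 159): 1.05, ("hindi", 49): 1.1, ("hindi", 43): 1.1,
    ("tamil", 188): 1.1, ("tamil", 128): 1.15, ("tamil", 176): 1.1,
    ("bengali", 125): 1.1,
    ("malayalam", 189): 1.1, ("malayalam", 124): 1.1,
    ("kannada", 142): 1.05, ("kannada", 138): 1.1, ("kannada", 131): 1.1, ("kannada", 59): 1.1,
    ("telugu", 69): 1.1, ("telugu", 133): 1.1,
    ("punjabi", 191): 1.08, ("punjabi", 67): 1.06, ("punjabi", 201): 1.1,
    ("gujarati", 62): 1.15, ("gujarati", 190): 1.25,
    ("marathi", 205): 1.05, ("marathi", 82): 1.05, ("marathi", 199): 1.1, ("marathi", 203): 1.15,
}

speed = RECOMMENDED_SPEED.get(("kannada", 142), 1.0)  # default to no speedup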
Contact Us
👉 Mail: [email protected]
👉 Website: https://snorbyte.com
Citation
BibTeX:
@misc{indictextaudio2025,
  title={snorTTS-Indic-v0: Multilingual Indic TTS},
  author={snorbyte},
  year={2025},
  howpublished={\url{https://huggingface.co/snorbyte/snorTTS-Indic-v0}},
  note={Apache-2.0}
}