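Below is `tts.py`: a Tortoise-TTS wrapper that implements FastRTC's `TTSModel` protocol (`tts`, `stream_tts`, and `stream_tts_sync`), with optional voice cloning from a user-supplied reference clip.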
```python
# tts.py
import os

import spaces  # noqa: F401 -- kept for Hugging Face Spaces environments
import torch
import torchaudio
import numpy as np
from typing import AsyncGenerator, Generator, Optional, Tuple
from numpy.typing import NDArray

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voices

# Create cache/output directory
os.makedirs("outputs", exist_ok=True)

# Global TTS model instance, shared by all wrapper instances
tts_model = None


# Options object for compatibility with FastRTC's TTSOptions pattern
class TortoiseOptions:
    def __init__(self, voice_preset="random", voice_file_path=None):
        self.voice_preset = voice_preset
        self.voice_file_path = voice_file_path


# The main Tortoise-TTS wrapper class implementing FastRTC's TTSModel protocol
class TortoiseTTSModel:
    def __init__(self):
        global tts_model
        if tts_model is None:
            self._initialize_model()
        self.tts_model = tts_model

    def _initialize_model(self):
        global tts_model
        print("Initializing Tortoise-TTS model...")
        # DeepSpeed inference only helps when a GPU is available
        tts_model = TextToSpeech(use_deepspeed=torch.cuda.is_available())
        print(f"Model initialized. Using device: {next(tts_model.autoregressive.parameters()).device}")

    def _generate_speech(self, text, options=None):
        options = options or TortoiseOptions()
        try:
            # Clone from a reference clip if one was provided; otherwise use a
            # named Tortoise voice ("random" by default)
            if options.voice_file_path and os.path.exists(options.voice_file_path):
                print(f"Loading voice from {options.voice_file_path}")
                # load_audio returns a single tensor, not a (tensor, sr) tuple
                voice_samples = [load_audio(options.voice_file_path, 22050)]
                conditioning_latents = None
            else:
                voice_samples, conditioning_latents = load_voices([options.voice_preset])

            # Generate speech
            print(f"Generating speech for text: {text[:50]}...")
            gen = self.tts_model.tts_with_preset(
                text,
                voice_samples=voice_samples,
                conditioning_latents=conditioning_latents,
                preset="fast",  # quality preset: ultra_fast/fast/standard/high_quality
            )

            # Flatten to 1-D float32 so the streaming helpers can slice it;
            # Tortoise synthesizes at a fixed 24 kHz
            return 24000, gen.squeeze().cpu().numpy().astype(np.float32)
        except Exception as e:
            print(f"Error generating speech: {str(e)}")
            raise

    def tts(self, text: str, options: Optional[TortoiseOptions] = None) -> Tuple[int, NDArray[np.float32]]:
        """Generate speech audio from text in a single call."""
        return self._generate_speech(text, options)

    async def stream_tts(self, text: str, options: Optional[TortoiseOptions] = None) -> AsyncGenerator[Tuple[int, NDArray[np.float32]], None]:
        """Stream speech audio asynchronously in chunks."""
        for chunk in self.stream_tts_sync(text, options):
            yield chunk

    def stream_tts_sync(self, text: str, options: Optional[TortoiseOptions] = None) -> Generator[Tuple[int, NDArray[np.float32]], None, None]:
        """Stream speech audio synchronously in chunks."""
        sample_rate, audio_array = self._generate_speech(text, options)
        # Split audio into fixed-size chunks for streaming
        chunk_size = 4000  # samples per chunk; adjust as needed
        for i in range(0, len(audio_array), chunk_size):
            yield sample_rate, audio_array[i:i + chunk_size]


# Singleton instance for easy import
tortoise_tts = TortoiseTTSModel()


# Legacy function for backward compatibility
async def generate_speech(text, voice_preset="random", voice_file_path=None):
    options = TortoiseOptions(voice_preset, voice_file_path)
    sample_rate, audio_array = tortoise_tts.tts(text, options)
    # Write the wav to disk so the returned path actually exists;
    # torchaudio.save expects a (channels, samples) tensor
    output_path = f"outputs/tts_output_{hash(text) % 10000}.wav"
    torchaudio.save(output_path, torch.from_numpy(audio_array).unsqueeze(0), sample_rate)
    return output_path, (sample_rate, torch.from_numpy(audio_array))
```