"""CosyVoice2 text-to-speech engine backed by the FunAudioLLM CosyVoice package."""

import logging
import sys
from collections.abc import Iterator
from typing import Any, Generator, Optional, Tuple

import numpy as np
import soundfile as sf

from utils.tts_base import TTSBase

# Configure logging
logger = logging.getLogger(__name__)

# Flag to track CosyVoice2 availability
COSYVOICE2_AVAILABLE = False
# Fallback sample rate, used only when the loaded model does not report its own.
DEFAULT_SAMPLE_RATE = 24000

# Key under which CosyVoice result dicts carry the waveform.
# NOTE(review): taken from the upstream README usage (``j['tts_speech']``) —
# confirm against the installed CosyVoice version.
_SPEECH_KEY = 'tts_speech'

# Try to import CosyVoice2 dependencies
try:
    import torch  # noqa: F401  (imported to verify the dependency is installed)
    import torchaudio  # noqa: F401  (imported to verify the dependency is installed)

    # Import CosyVoice2 from the correct package
    # Based on https://github.com/FunAudioLLM/CosyVoice
    from cosyvoice.cli.cosyvoice import CosyVoice

    COSYVOICE2_AVAILABLE = True
    logger.info("CosyVoice2 TTS engine is available")
except ImportError as e:
    # ModuleNotFoundError is a subclass of ImportError, so this one handler
    # covers both; a second ``except ModuleNotFoundError`` could never run.
    logger.warning(f"CosyVoice2 TTS engine is not available - ImportError: {str(e)}")


class _InferenceError(Exception):
    """Raised when both CosyVoice inference entry points have failed.

    The failure is already logged where it is raised, so callers only need to
    abort quietly.
    """


def _get_model():
    """Lazy-load the CosyVoice2 model

    Returns:
        CosyVoice2 or None: The CosyVoice2 model or None if not available
    """
    if not COSYVOICE2_AVAILABLE:
        logger.warning("CosyVoice2 TTS engine is not available")
        return None

    try:
        from cosyvoice.cli.cosyvoice import CosyVoice

        # Initialize the model with correct path
        model = CosyVoice('pretrained_models/CosyVoice-300M')
        logger.info("CosyVoice2 model successfully loaded")
        return model
    except ImportError as e:
        logger.error(f"Failed to import CosyVoice2 dependencies: {str(e)}")
        return None
    except FileNotFoundError as e:
        logger.error(f"Failed to load CosyVoice2 model files: {str(e)}")
        return None
    except Exception as e:
        logger.error(f"Failed to initialize CosyVoice2 model: {str(e)}")
        return None


def _is_torch_tensor(obj: Any) -> bool:
    """Return True if *obj* is a ``torch.Tensor``, without importing torch.

    A tensor instance can only exist if ``torch`` has already been imported,
    so looking the module up in ``sys.modules`` is sufficient and keeps this
    helper usable when torch is not installed.
    """
    torch_module = sys.modules.get("torch")
    return torch_module is not None and isinstance(obj, torch_module.Tensor)


def _to_numpy_audio(audio: Any) -> Any:
    """Normalise a CosyVoice inference result into a NumPy waveform.

    Handles the result shapes the model may hand back:

    * ``None`` -> ``None``
    * a dict -> the value stored under ``'tts_speech'`` (recursively normalised)
    * a lazy iterator/generator of results -> every chunk is normalised and
      the chunks are concatenated in order
    * a ``torch.Tensor`` -> moved to CPU and converted to ``numpy.ndarray``
    * anything else -> returned unchanged

    A 2-D array with a leading singleton axis, i.e. ``(1, num_samples)``, is
    reduced to 1-D so that ``soundfile`` does not read it as one frame with
    ``num_samples`` channels.

    Args:
        audio: Raw value returned by a CosyVoice ``inference_*`` call.

    Returns:
        The normalised audio, or None if no audio was produced.
    """
    if audio is None:
        return None

    if isinstance(audio, dict):
        return _to_numpy_audio(audio.get(_SPEECH_KEY))

    if isinstance(audio, Iterator):
        # Consuming the iterator is what actually runs a lazy inference, so
        # any model error surfaces here rather than at the call site.
        chunks = [
            chunk for chunk in (_to_numpy_audio(item) for item in audio)
            if chunk is not None
        ]
        if not chunks:
            return None
        return chunks[0] if len(chunks) == 1 else np.concatenate(chunks, axis=-1)

    if _is_torch_tensor(audio):
        audio = audio.detach().cpu().numpy()

    if isinstance(audio, np.ndarray) and audio.ndim == 2 and audio.shape[0] == 1:
        audio = audio[0]

    return audio


class CosyVoice2TTS(TTSBase):
    """CosyVoice2 TTS engine implementation

    This engine uses the CosyVoice2 model for TTS generation. The model is
    loaded lazily on the first synthesis request.
    """

    def __init__(self, lang_code: str = 'z'):
        """Initialize the CosyVoice2 TTS engine

        Args:
            lang_code (str): Language code for the engine
        """
        super().__init__(lang_code)
        self.model = None

    def _ensure_model(self):
        """Ensure the model is loaded

        Returns:
            bool: True if model is available, False otherwise
        """
        if self.model is None:
            self.model = _get_model()

        return self.model is not None

    def _sample_rate(self) -> int:
        """Return the sample rate to tag generated audio with.

        Uses the loaded model's ``sample_rate`` attribute when it is a
        positive integer, otherwise ``DEFAULT_SAMPLE_RATE``.
        """
        rate = getattr(self.model, "sample_rate", None)
        if isinstance(rate, int) and rate > 0:
            return rate
        return DEFAULT_SAMPLE_RATE

    def _run_inference(self, text: str) -> Any:
        """Synthesize *text* and return the normalised waveform.

        Tries ``inference_sft`` first and falls back to
        ``inference_zero_shot`` if that raises. Conversion happens inside each
        ``try`` so that errors raised while consuming a lazy result also
        trigger the fallback.

        Args:
            text (str): Input text to synthesize

        Returns:
            The normalised audio, or None if the model produced no audio.

        Raises:
            _InferenceError: If both inference entry points fail (already logged).
        """
        try:
            # Use the inference method from CosyVoice
            return _to_numpy_audio(self.model.inference_sft(text, '中文女'))
        except Exception as api_error:
            # Try alternative API if the first one fails
            try:
                # NOTE(review): upstream documents the third argument of
                # inference_zero_shot as a 16 kHz prompt waveform, not a
                # speaker name — confirm this fallback can succeed.
                return _to_numpy_audio(
                    self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
                )
            except Exception as alt_error:
                logger.error(f"CosyVoice2 inference failed: {str(api_error)}")
                logger.error(f"CosyVoice2 fallback inference also failed: {str(alt_error)}")
                raise _InferenceError(str(api_error)) from alt_error

    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
        """Generate speech using CosyVoice2 TTS engine

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID (currently not used by this engine)
            speed (float): Speech speed multiplier (currently not used by this engine)

        Returns:
            Optional[str]: Path to the generated audio file or None if generation fails
        """
        # Reject unusable input before touching the model; this also keeps
        # the len(text) log line below from raising on None.
        if not isinstance(text, str) or not text.strip():
            logger.error("CosyVoice2 received empty text; nothing to synthesize")
            return None

        logger.info(f"Generating speech with CosyVoice2 for text length: {len(text)}")

        # Check if CosyVoice2 is available
        if not COSYVOICE2_AVAILABLE:
            logger.error("CosyVoice2 TTS engine is not available")
            return None

        # Ensure model is loaded
        if not self._ensure_model():
            logger.error("Failed to load CosyVoice2 model")
            return None

        try:
            # Generate unique output path
            output_path = self._generate_output_path(prefix="cosyvoice2")

            # Generate audio using CosyVoice2
            try:
                output_audio_np = self._run_inference(text)
            except _InferenceError:
                # Both inference attempts failed and were logged already.
                return None

            if output_audio_np is None:
                logger.error("CosyVoice2 model returned None for audio output")
                return None

            logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
            sf.write(output_path, output_audio_np, self._sample_rate())
            logger.info(f"CosyVoice2 audio generation complete: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"Error generating speech with CosyVoice2: {str(e)}", exc_info=True)
            return None

    def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate speech stream using CosyVoice2 TTS engine

        The whole utterance is synthesized first and then yielded as a single
        segment.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID (currently not used by this engine)
            speed (float): Speech speed multiplier (currently not used by this engine)

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        # Reject unusable input before touching the model; this also keeps
        # the len(text) log line below from raising on None.
        if not isinstance(text, str) or not text.strip():
            logger.error("CosyVoice2 received empty text; nothing to synthesize")
            return

        logger.info(f"Generating speech stream with CosyVoice2 for text length: {len(text)}")

        # Check if CosyVoice2 is available
        if not COSYVOICE2_AVAILABLE:
            logger.error("CosyVoice2 TTS engine is not available")
            return

        # Ensure model is loaded
        if not self._ensure_model():
            logger.error("Failed to load CosyVoice2 model")
            return

        try:
            # Generate audio using CosyVoice2
            try:
                output_audio_np = self._run_inference(text)
            except _InferenceError:
                # Both inference attempts failed and were logged already.
                return

            if output_audio_np is None:
                logger.error("CosyVoice2 model returned None for audio output")
                return

            logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
            yield self._sample_rate(), output_audio_np

        except Exception as e:
            logger.error(f"Error generating speech stream with CosyVoice2: {str(e)}", exc_info=True)
            return