Michael Hu committed
Commit 7495571 · 1 Parent(s): 37aaac6

refactor tts module

.vercel/project.json ADDED
@@ -0,0 +1 @@
+{"projectName":"trae_5altap2j"}
dia/__init__.py DELETED
File without changes
dia/audio.py DELETED
@@ -1,185 +0,0 @@
-import typing as tp
-
-import torch
-
-
-def build_delay_indices(B: int, T: int, C: int, delay_pattern: tp.List[int]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-    """
-    Precompute (t_idx_BxTxC, indices_BTCx3) so that out[t, c] = in[t - delay[c], c].
-    Negative t_idx => BOS; t_idx >= T => PAD.
-    """
-    delay_arr = torch.tensor(delay_pattern, dtype=torch.int32)
-
-    t_idx_BxT = torch.broadcast_to(
-        torch.arange(T, dtype=torch.int32)[None, :],
-        [B, T],
-    )
-    t_idx_BxTx1 = t_idx_BxT[..., None]
-    t_idx_BxTxC = t_idx_BxTx1 - delay_arr.view(1, 1, C)
-
-    b_idx_BxTxC = torch.broadcast_to(
-        torch.arange(B, dtype=torch.int32).view(B, 1, 1),
-        [B, T, C],
-    )
-    c_idx_BxTxC = torch.broadcast_to(
-        torch.arange(C, dtype=torch.int32).view(1, 1, C),
-        [B, T, C],
-    )
-
-    # We must clamp time indices to [0..T-1] so the gather_nd equivalent won't fail
-    t_clamped_BxTxC = torch.clamp(t_idx_BxTxC, 0, T - 1)
-
-    indices_BTCx3 = torch.stack(
-        [
-            b_idx_BxTxC.reshape(-1),
-            t_clamped_BxTxC.reshape(-1),
-            c_idx_BxTxC.reshape(-1),
-        ],
-        dim=1,
-    ).long()  # Ensure indices are long type for indexing
-
-    return t_idx_BxTxC, indices_BTCx3
-
-
-def apply_audio_delay(
-    audio_BxTxC: torch.Tensor,
-    pad_value: int,
-    bos_value: int,
-    precomp: tp.Tuple[torch.Tensor, torch.Tensor],
-) -> torch.Tensor:
-    """
-    Applies the delay pattern to batched audio tokens using precomputed indices,
-    inserting BOS where t_idx < 0 and PAD where t_idx >= T.
-
-    Args:
-        audio_BxTxC: [B, T, C] int16 audio tokens (or int32/float)
-        pad_value: the padding token
-        bos_value: the BOS token
-        precomp: (t_idx_BxTxC, indices_BTCx3) from build_delay_indices
-
-    Returns:
-        result_BxTxC: [B, T, C] delayed audio tokens
-    """
-    device = audio_BxTxC.device  # Get device from input tensor
-    t_idx_BxTxC, indices_BTCx3 = precomp
-    t_idx_BxTxC = t_idx_BxTxC.to(device)  # Move precomputed indices to device
-    indices_BTCx3 = indices_BTCx3.to(device)
-
-    # Equivalent of tf.gather_nd using advanced indexing
-    # Ensure indices are long type if not already (build_delay_indices should handle this)
-    gathered_flat = audio_BxTxC[indices_BTCx3[:, 0], indices_BTCx3[:, 1], indices_BTCx3[:, 2]]
-    gathered_BxTxC = gathered_flat.view(audio_BxTxC.shape)
-
-    # Create masks on the correct device
-    mask_bos = t_idx_BxTxC < 0  # => place bos_value
-    mask_pad = t_idx_BxTxC >= audio_BxTxC.shape[1]  # => place pad_value
-
-    # Create scalar tensors on the correct device
-    bos_tensor = torch.tensor(bos_value, dtype=audio_BxTxC.dtype, device=device)
-    pad_tensor = torch.tensor(pad_value, dtype=audio_BxTxC.dtype, device=device)
-
-    # If mask_bos, BOS; else if mask_pad, PAD; else original gather
-    # All tensors should now be on the same device
-    result_BxTxC = torch.where(mask_bos, bos_tensor, torch.where(mask_pad, pad_tensor, gathered_BxTxC))
-
-    return result_BxTxC
-
-
-def build_revert_indices(B: int, T: int, C: int, delay_pattern: tp.List[int]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-    """
-    Precompute indices for the revert operation using PyTorch.
-
-    Returns:
-        A tuple (t_idx_BxTxC, indices_BTCx3) where:
-            - t_idx_BxTxC is a tensor of shape [B, T, C] computed as time indices plus the delay.
-            - indices_BTCx3 is a tensor of shape [B*T*C, 3] used for gathering, computed from:
-              batch indices, clamped time indices, and channel indices.
-    """
-    # Use default device unless specified otherwise; assumes inputs might define device later
-    device = None  # Or determine dynamically if needed, e.g., from a model parameter
-
-    delay_arr = torch.tensor(delay_pattern, dtype=torch.int32, device=device)
-
-    t_idx_BT1 = torch.broadcast_to(torch.arange(T, device=device).unsqueeze(0), [B, T])
-    t_idx_BT1 = t_idx_BT1.unsqueeze(-1)
-
-    t_idx_BxTxC = torch.minimum(
-        t_idx_BT1 + delay_arr.view(1, 1, C),
-        torch.tensor(T - 1, device=device),
-    )
-    b_idx_BxTxC = torch.broadcast_to(torch.arange(B, device=device).view(B, 1, 1), [B, T, C])
-    c_idx_BxTxC = torch.broadcast_to(torch.arange(C, device=device).view(1, 1, C), [B, T, C])
-
-    indices_BTCx3 = torch.stack(
-        [
-            b_idx_BxTxC.reshape(-1),
-            t_idx_BxTxC.reshape(-1),
-            c_idx_BxTxC.reshape(-1),
-        ],
-        dim=1,
-    ).long()  # Ensure indices are long type
-
-    return t_idx_BxTxC, indices_BTCx3
-
-
-def revert_audio_delay(
-    audio_BxTxC: torch.Tensor,
-    pad_value: int,
-    precomp: tp.Tuple[torch.Tensor, torch.Tensor],
-    T: int,
-) -> torch.Tensor:
-    """
-    Reverts a delay pattern from batched audio tokens using precomputed indices (PyTorch version).
-
-    Args:
-        audio_BxTxC: Input delayed audio tensor
-        pad_value: Padding value for out-of-bounds indices
-        precomp: Precomputed revert indices tuple containing:
-            - t_idx_BxTxC: Time offset indices tensor
-            - indices_BTCx3: Gather indices tensor for original audio
-        T: Original sequence length before padding
-
-    Returns:
-        Reverted audio tensor with the same shape as the input
-    """
-    t_idx_BxTxC, indices_BTCx3 = precomp
-    device = audio_BxTxC.device  # Get device from input tensor
-
-    # Move precomputed indices to the same device as audio_BxTxC if they aren't already
-    t_idx_BxTxC = t_idx_BxTxC.to(device)
-    indices_BTCx3 = indices_BTCx3.to(device)
-
-    # Using PyTorch advanced indexing (equivalent to tf.gather_nd or the np equivalent)
-    gathered_flat = audio_BxTxC[indices_BTCx3[:, 0], indices_BTCx3[:, 1], indices_BTCx3[:, 2]]
-    gathered_BxTxC = gathered_flat.view(audio_BxTxC.size())  # Use .size() for robust reshaping
-
-    # Create pad_tensor on the correct device
-    pad_tensor = torch.tensor(pad_value, dtype=audio_BxTxC.dtype, device=device)
-    # Create T tensor on the correct device for comparison
-    T_tensor = torch.tensor(T, device=device)
-
-    result_BxTxC = torch.where(t_idx_BxTxC >= T_tensor, pad_tensor, gathered_BxTxC)
-
-    return result_BxTxC
-
-
-@torch.no_grad()
-@torch.inference_mode()
-def decode(
-    model,
-    audio_codes,
-):
-    """
-    Decodes the given frames into an output audio waveform
-    """
-    if len(audio_codes) != 1:
-        raise ValueError(f"Expected one frame, got {len(audio_codes)}")
-
-    try:
-        audio_values = model.quantizer.from_codes(audio_codes)
-        audio_values = model.decode(audio_values[0])
-
-        return audio_values
-    except Exception as e:
-        print(f"Error in decode method: {str(e)}")
-        raise
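The delay-pattern helpers above are the subtlest part of this file, so here is a minimal round-trip sketch (not part of the commit) showing how they compose. It assumes the pre-refactor `dia.audio` module layout, and the BOS/PAD values 1026/1025 are taken from the DataConfig defaults in dia/config.py:

import torch

from dia.audio import apply_audio_delay, build_delay_indices, build_revert_indices, revert_audio_delay

# Toy sizes: batch 1, 16 timesteps, 3 channels with delays [0, 1, 2].
B, T, C = 1, 16, 3
delay_pattern = [0, 1, 2]
codes = torch.randint(0, 1024, (B, T, C), dtype=torch.int32)

# Shift channel c forward by delay_pattern[c]; BOS fills the head of each channel.
delayed = apply_audio_delay(
    codes,
    pad_value=1025,  # audio_pad_value default from DataConfig
    bos_value=1026,  # audio_bos_value default from DataConfig
    precomp=build_delay_indices(B, T, C, delay_pattern),
)

# Undo the shift using the precomputed revert indices.
reverted = revert_audio_delay(
    delayed,
    pad_value=1025,
    precomp=build_revert_indices(B, T, C, delay_pattern),
    T=T,
)

# Entries whose delayed time index stayed inside [0, T) round-trip exactly;
# only the last max(delay_pattern) steps are affected by the boundary clamp.
max_d = max(delay_pattern)
assert torch.equal(codes[:, : T - max_d, :], reverted[:, : T - max_d, :])

This is the same pairing model.py performs: `_prepare_audio_prompt` applies the delay before prefill, and `_generate_output` reverts it and trims the final `max_delay_pattern` steps before DAC decoding.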
dia/config.py DELETED
@@ -1,187 +0,0 @@
-"""Configuration management module for the Dia model.
-
-This module provides comprehensive configuration management for the Dia model,
-utilizing Pydantic for validation. It defines configurations for data processing,
-model architecture (encoder and decoder), and training settings.
-
-Key components:
-- DataConfig: Parameters for data loading and preprocessing.
-- EncoderConfig: Architecture details for the encoder module.
-- DecoderConfig: Architecture details for the decoder module.
-- ModelConfig: Combined model architecture settings.
-- TrainingConfig: Training hyperparameters and settings.
-- DiaConfig: Master configuration combining all components.
-"""
-
-import os
-from typing import Annotated
-
-from pydantic import BaseModel, BeforeValidator, Field
-
-
-class DataConfig(BaseModel, frozen=True):
-    """Configuration for data loading and preprocessing.
-
-    Attributes:
-        text_length: Maximum length of text sequences (must be a multiple of 128).
-        audio_length: Maximum length of audio sequences (must be a multiple of 128).
-        channels: Number of audio channels.
-        text_pad_value: Value used for padding text sequences.
-        audio_eos_value: Value representing the end of audio sequences.
-        audio_bos_value: Value representing the beginning of audio sequences.
-        audio_pad_value: Value used for padding audio sequences.
-        delay_pattern: List of delay values for each audio channel.
-    """
-
-    text_length: Annotated[int, BeforeValidator(lambda x: (x + 127) // 128 * 128)] = Field(gt=0, multiple_of=128)
-    audio_length: Annotated[int, BeforeValidator(lambda x: (x + 127) // 128 * 128)] = Field(gt=0, multiple_of=128)
-    channels: int = Field(default=9, gt=0, multiple_of=1)
-    text_pad_value: int = Field(default=0)
-    audio_eos_value: int = Field(default=1024)
-    audio_pad_value: int = Field(default=1025)
-    audio_bos_value: int = Field(default=1026)
-    delay_pattern: list[Annotated[int, Field(ge=0)]] = Field(default_factory=lambda: [0, 8, 9, 10, 11, 12, 13, 14, 15])
-
-    def __hash__(self) -> int:
-        """Generate a hash based on all fields of the config."""
-        return hash(
-            (
-                self.text_length,
-                self.audio_length,
-                self.channels,
-                self.text_pad_value,
-                self.audio_pad_value,
-                self.audio_bos_value,
-                self.audio_eos_value,
-                tuple(self.delay_pattern),
-            )
-        )
-
-
-class EncoderConfig(BaseModel, frozen=True):
-    """Configuration for the encoder component of the Dia model.
-
-    Attributes:
-        n_layer: Number of transformer layers.
-        n_embd: Embedding dimension.
-        n_hidden: Hidden dimension size in the MLP layers.
-        n_head: Number of attention heads.
-        head_dim: Dimension per attention head.
-    """
-
-    n_layer: int = Field(gt=0)
-    n_embd: int = Field(gt=0)
-    n_hidden: int = Field(gt=0)
-    n_head: int = Field(gt=0)
-    head_dim: int = Field(gt=0)
-
-
-class DecoderConfig(BaseModel, frozen=True):
-    """Configuration for the decoder component of the Dia model.
-
-    Attributes:
-        n_layer: Number of transformer layers.
-        n_embd: Embedding dimension.
-        n_hidden: Hidden dimension size in the MLP layers.
-        gqa_query_heads: Number of query heads for grouped-query self-attention.
-        kv_heads: Number of key/value heads for grouped-query self-attention.
-        gqa_head_dim: Dimension per query head for grouped-query self-attention.
-        cross_query_heads: Number of query heads for cross-attention.
-        cross_head_dim: Dimension per cross-attention head.
-    """
-
-    n_layer: int = Field(gt=0)
-    n_embd: int = Field(gt=0)
-    n_hidden: int = Field(gt=0)
-    gqa_query_heads: int = Field(gt=0)
-    kv_heads: int = Field(gt=0)
-    gqa_head_dim: int = Field(gt=0)
-    cross_query_heads: int = Field(gt=0)
-    cross_head_dim: int = Field(gt=0)
-
-
-class ModelConfig(BaseModel, frozen=True):
-    """Main configuration container for the Dia model architecture.
-
-    Attributes:
-        encoder: Configuration for the encoder component.
-        decoder: Configuration for the decoder component.
-        src_vocab_size: Size of the source (text) vocabulary.
-        tgt_vocab_size: Size of the target (audio code) vocabulary.
-        dropout: Dropout probability applied within the model.
-        normalization_layer_epsilon: Epsilon value for normalization layers (e.g., LayerNorm).
-        weight_dtype: Data type for model weights (e.g., "float32", "bfloat16").
-        rope_min_timescale: Minimum timescale for Rotary Positional Embeddings (RoPE).
-        rope_max_timescale: Maximum timescale for Rotary Positional Embeddings (RoPE).
-    """
-
-    encoder: EncoderConfig
-    decoder: DecoderConfig
-    src_vocab_size: int = Field(default=128, gt=0)
-    tgt_vocab_size: int = Field(default=1028, gt=0)
-    dropout: float = Field(default=0.0, ge=0.0, lt=1.0)
-    normalization_layer_epsilon: float = Field(default=1.0e-5, ge=0.0)
-    weight_dtype: str = Field(default="float32", description="Weight precision")
-    rope_min_timescale: int = Field(default=1, description="Timescale for global attention")
-    rope_max_timescale: int = Field(default=10_000, description="Timescale for global attention")
-
-
-class TrainingConfig(BaseModel, frozen=True):
-    pass
-
-
-class DiaConfig(BaseModel, frozen=True):
-    """Master configuration for the Dia model.
-
-    Combines all sub-configurations into a single validated object.
-
-    Attributes:
-        version: Configuration version string.
-        model: Model architecture configuration.
-        training: Training process configuration (precision settings).
-        data: Data loading and processing configuration.
-    """
-
-    version: str = Field(default="1.0")
-    model: ModelConfig
-    # TODO: remove training. this is just for backward compatibility
-    training: TrainingConfig | None = Field(default=None)
-    data: DataConfig
-
-    def save(self, path: str) -> None:
-        """Save the current configuration instance to a JSON file.
-
-        Ensures the parent directory exists and the file has a .json extension.
-
-        Args:
-            path: The target file path to save the configuration.
-
-        Raises:
-            ValueError: If the path is not a file with a .json extension.
-        """
-        os.makedirs(os.path.dirname(path), exist_ok=True)
-        config_json = self.model_dump_json(indent=2)
-        with open(path, "w") as f:
-            f.write(config_json)
-
-    @classmethod
-    def load(cls, path: str) -> "DiaConfig | None":
-        """Load and validate a Dia configuration from a JSON file.
-
-        Args:
-            path: The path to the configuration file.
-
-        Returns:
-            A validated DiaConfig instance if the file exists and is valid,
-            otherwise None if the file is not found.
-
-        Raises:
-            ValueError: If the path does not point to an existing .json file.
-            pydantic.ValidationError: If the JSON content fails validation against the DiaConfig schema.
-        """
-        try:
-            with open(path, "r") as f:
-                content = f.read()
-            return cls.model_validate_json(content)
-        except FileNotFoundError:
-            return None
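For reference, a minimal sketch (not part of the commit) of constructing and round-tripping the removed config. The layer and head sizes below are made-up toy values chosen only to satisfy the validators, not the shipped Dia-1.6B hyperparameters:

from dia.config import DataConfig, DecoderConfig, DiaConfig, EncoderConfig, ModelConfig

config = DiaConfig(
    model=ModelConfig(
        encoder=EncoderConfig(n_layer=2, n_embd=128, n_hidden=512, n_head=4, head_dim=32),
        decoder=DecoderConfig(
            n_layer=2,
            n_embd=128,
            n_hidden=512,
            gqa_query_heads=4,
            kv_heads=2,
            gqa_head_dim=32,
            cross_query_heads=4,
            cross_head_dim=32,
        ),
    ),
    # The BeforeValidator rounds lengths up to the next multiple of 128, so 1000 -> 1024.
    data=DataConfig(text_length=1000, audio_length=1000),
)

assert config.data.text_length == 1024
config.save("checkpoints/config.json")      # creates the directory, writes indented JSON
restored = DiaConfig.load("checkpoints/config.json")
assert restored == config                   # frozen pydantic models compare by field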
dia/layers.py DELETED
@@ -1,624 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from huggingface_hub import PyTorchModelHubMixin
-from torch import Tensor
-from torch.nn import RMSNorm
-
-from .config import DiaConfig
-from .state import DecoderInferenceState, EncoderInferenceState, KVCache
-
-
-def _normalize_axes(axes: tuple[int, ...], ndim: int) -> tuple[int, ...]:
-    return tuple(ax if ax >= 0 else ndim + ax for ax in axes)
-
-
-class DenseGeneral(nn.Module):
-    """
-    PyTorch equivalent of flax.linen.DenseGeneral with shapes defined at init.
-
-    Stores weights (`kernel`) in the same layout as Jax and uses torch.tensordot
-    for the generalized matrix multiplication. Weight shapes are calculated
-    and parameters created during initialization.
-
-    Attributes:
-        axis (Tuple[int, ...]): Input axis or axes to contract.
-        in_shapes (Tuple[int, ...]): Sizes of the input dimensions specified by `axis`.
-        out_features (Tuple[int, ...]): Shape of the output features (non-contracted dims).
-        weight (nn.Parameter): The kernel parameter.
-    """
-
-    def __init__(
-        self,
-        in_shapes: tuple[int, ...],
-        out_features: tuple[int, ...],
-        axis: tuple[int, ...] = (-1,),
-        weight_dtype: torch.dtype | None = None,
-        device: torch.device | None = None,
-    ):
-        super().__init__()
-        self.in_shapes = in_shapes
-        self.out_features = out_features
-        self.axis = axis
-        self.kernel_shape = self.in_shapes + self.out_features
-
-        factory_kwargs = {"device": device, "dtype": weight_dtype}
-        self.weight = nn.Parameter(torch.empty(self.kernel_shape, **factory_kwargs))
-
-    def forward(self, inputs: Tensor) -> Tensor:
-        norm_axis = _normalize_axes(self.axis, inputs.ndim)
-        kernel_contract_axes = tuple(range(len(norm_axis)))
-
-        output = torch.tensordot(
-            inputs.to(self.weight.dtype),
-            self.weight,
-            dims=(norm_axis, kernel_contract_axes),
-        ).to(inputs.dtype)
-        return output
-
-
-class MlpBlock(nn.Module):
-    """MLP block using DenseGeneral."""
-
-    def __init__(self, embed_dim: int, intermediate_dim: int, compute_dtype: torch.dtype):
-        super().__init__()
-        self.dtype = compute_dtype
-
-        self.wi_fused = DenseGeneral(
-            in_shapes=(embed_dim,),
-            out_features=(2, intermediate_dim),
-            axis=(-1,),
-            weight_dtype=compute_dtype,
-        )
-
-        self.wo = DenseGeneral(
-            in_shapes=(intermediate_dim,),
-            out_features=(embed_dim,),
-            axis=(-1,),
-            weight_dtype=compute_dtype,
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Forward pass."""
-        fused_x = self.wi_fused(x)
-
-        gate = fused_x[..., 0, :]
-        up = fused_x[..., 1, :]
-
-        hidden = torch.mul(F.silu(gate), up).to(self.dtype)
-
-        output = self.wo(hidden)
-        return output
-
-
-class RotaryEmbedding(nn.Module):
-    """Rotary Position Embedding (RoPE) implementation in PyTorch."""
-
-    def __init__(
-        self,
-        embedding_dims: int,
-        min_timescale: int = 1,
-        max_timescale: int = 10000,
-        dtype: torch.dtype = torch.float32,
-    ):
-        super().__init__()
-        if embedding_dims % 2 != 0:
-            raise ValueError("Embedding dim must be even for RoPE.")
-        self.embedding_dims = embedding_dims
-        self.min_timescale = min_timescale
-        self.max_timescale = max_timescale
-        self.compute_dtype = dtype
-
-        half_embedding_dim = embedding_dims // 2
-        fraction = (2.0 * torch.arange(0, half_embedding_dim)) / embedding_dims
-        timescale = (self.min_timescale * (self.max_timescale / self.min_timescale) ** fraction).to(torch.float32)
-        self.register_buffer("timescale", timescale, persistent=False)
-
-    def forward(self, inputs: torch.Tensor, position: torch.Tensor):
-        """Applies RoPE."""
-        position = position.unsqueeze(-1).unsqueeze(-1)
-        sinusoid_inp = position / self.timescale
-        sin = torch.sin(sinusoid_inp)
-        cos = torch.cos(sinusoid_inp)
-        first_half, second_half = torch.chunk(inputs.to(torch.float32), 2, dim=-1)
-        first_part = first_half * cos - second_half * sin
-        second_part = second_half * cos + first_half * sin
-        return torch.cat((first_part.to(self.compute_dtype), second_part.to(self.compute_dtype)), dim=-1)
-
-
-class Attention(nn.Module):
-    """Attention using DenseGeneral."""
-
-    def __init__(
-        self,
-        config: DiaConfig,
-        q_embed_dim: int,
-        kv_embed_dim: int,
-        num_query_heads: int,
-        num_kv_heads: int,
-        head_dim: int,
-        compute_dtype: torch.dtype,
-        is_cross_attn: bool = False,
-        out_embed_dim: int | None = None,
-    ):
-        super().__init__()
-        self.num_query_heads = num_query_heads
-        self.num_kv_heads = num_kv_heads
-        self.head_dim = head_dim
-        self.is_cross_attn = is_cross_attn
-        self.output_dim = out_embed_dim if out_embed_dim is not None else q_embed_dim
-        self.projected_query_dim = num_query_heads * head_dim
-        if num_query_heads % num_kv_heads != 0:
-            raise ValueError(f"num_query_heads ({num_query_heads}) must be divisible by num_kv_heads ({num_kv_heads})")
-        self.num_gqa_groups = num_query_heads // num_kv_heads
-
-        # --- Projection Layers using DenseGeneral ---
-        self.q_proj = DenseGeneral(
-            in_shapes=(q_embed_dim,),
-            out_features=(num_query_heads, head_dim),
-            axis=(-1,),
-            weight_dtype=compute_dtype,
-        )
-        self.k_proj = DenseGeneral(
-            in_shapes=(kv_embed_dim,),
-            out_features=(num_kv_heads, head_dim),
-            axis=(-1,),
-            weight_dtype=compute_dtype,
-        )
-        self.v_proj = DenseGeneral(
-            in_shapes=(kv_embed_dim,),
-            out_features=(num_kv_heads, head_dim),
-            axis=(-1,),
-            weight_dtype=compute_dtype,
-        )
-        self.o_proj = DenseGeneral(
-            in_shapes=(num_query_heads, head_dim),
-            out_features=(self.output_dim,),
-            axis=(-2, -1),
-            weight_dtype=compute_dtype,
-        )
-
-        # --- Rotary Embedding ---
-        self.rotary_emb = RotaryEmbedding(
-            embedding_dims=self.head_dim,
-            min_timescale=config.model.rope_min_timescale,
-            max_timescale=config.model.rope_max_timescale,
-            dtype=compute_dtype,
-        )
-
-    def forward(
-        self,
-        Xq: torch.Tensor,  # (B, T, D) T = 1 in AR generation
-        Xkv: torch.Tensor,  # (B, S, E) S = 1 in AR generation
-        q_positions: torch.Tensor,  # (B, T)
-        kv_positions: torch.Tensor | None = None,  # (B, S)
-        attn_mask: torch.Tensor | None = None,  # None in decoder self-attention, valid mask in others
-        cache: KVCache | None = None,  # None in encoder, KVCache in decoder
-        prefill: bool = False,
-        is_causal: bool = False,
-    ) -> torch.Tensor:
-        """
-        Performs attention calculation with optional KV caching.
-
-        Args:
-            Xq: Query tensor (B, T, D). T=1 during single-step decoding.
-            Xkv: Key/Value source tensor (B, S, E). S=1 during single-step decoding for self-attn.
-            q_positions: Positions for queries (B, T).
-            kv_positions: Positions for keys/values (B, S). If None, uses q_positions.
-            attn_mask: Attention mask.
-            cache: KVCache.
-            prefill: If True, use prefill mode.
-
-        Returns:
-            output: The attention output tensor (B, T, output_dim).
-        """
-        if kv_positions is None:
-            kv_positions = q_positions
-        original_dtype = Xq.dtype
-
-        Xq_BxTxNxH = self.q_proj(Xq)
-        Xq_BxTxNxH = self.rotary_emb(Xq_BxTxNxH, position=q_positions)
-        Xq_BxNxTxH = Xq_BxTxNxH.transpose(1, 2)
-
-        attn_k: torch.Tensor | None = None
-        attn_v: torch.Tensor | None = None
-
-        if self.is_cross_attn:
-            attn_k, attn_v = cache.k, cache.v
-        else:
-            Xk_BxSxKxH = self.k_proj(Xkv)  # (B, S, K, H)
-            Xv_BxSxKxH = self.v_proj(Xkv)  # (B, S, K, H)
-            Xk_BxSxKxH = self.rotary_emb(Xk_BxSxKxH, position=kv_positions)  # (B, S, K, H)
-
-            Xk_BxKxSxH = Xk_BxSxKxH.transpose(1, 2)  # (B, K, S, H)
-            Xv_BxKxSxH = Xv_BxSxKxH.transpose(1, 2)  # (B, K, S, H)
-
-            if cache is None:
-                attn_k = Xk_BxKxSxH
-                attn_v = Xv_BxKxSxH
-            else:
-                if prefill:
-                    attn_k, attn_v = Xk_BxKxSxH, Xv_BxKxSxH
-                    cache.prefill(attn_k, attn_v)
-                else:
-                    attn_k, attn_v = cache.update(Xk_BxKxSxH, Xv_BxKxSxH)
-
-        attn_output = F.scaled_dot_product_attention(
-            Xq_BxNxTxH,
-            attn_k,
-            attn_v,
-            attn_mask=attn_mask,
-            scale=1.0,
-            enable_gqa=self.num_gqa_groups > 1,
-            is_causal=is_causal,
-        )
-
-        attn_output = attn_output.transpose(1, 2).contiguous()  # (B, T, N, H)
-        output = self.o_proj(attn_output)
-
-        return output.to(original_dtype)
-
-
-class EncoderLayer(nn.Module):
-    """Transformer Encoder Layer using DenseGeneral."""
-
-    def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
-        super().__init__()
-        self.config = config
-        model_config = config.model
-        enc_config = config.model.encoder
-        embed_dim = enc_config.n_embd
-        self.compute_dtype = compute_dtype
-
-        self.pre_sa_norm = RMSNorm(
-            embed_dim,
-            eps=model_config.normalization_layer_epsilon,
-            dtype=torch.float32,
-        )
-        self.self_attention = Attention(
-            config,
-            q_embed_dim=embed_dim,
-            kv_embed_dim=embed_dim,
-            num_query_heads=enc_config.n_head,
-            num_kv_heads=enc_config.n_head,
-            head_dim=enc_config.head_dim,
-            compute_dtype=compute_dtype,
-            is_cross_attn=False,
-            out_embed_dim=embed_dim,
-        )
-        self.post_sa_norm = RMSNorm(
-            embed_dim,
-            eps=model_config.normalization_layer_epsilon,
-            dtype=torch.float32,
-        )
-        self.mlp = MlpBlock(embed_dim=embed_dim, intermediate_dim=enc_config.n_hidden, compute_dtype=compute_dtype)
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        state: EncoderInferenceState,
-    ) -> torch.Tensor:
-        residual = x
-        x_norm = self.pre_sa_norm(x).to(self.compute_dtype)
-
-        sa_out = self.self_attention(
-            Xq=x_norm,
-            Xkv=x_norm,
-            q_positions=state.positions,
-            kv_positions=state.positions,
-            attn_mask=state.attn_mask,
-        )
-        x = residual + sa_out
-
-        residual = x
-        x_norm = self.post_sa_norm(x).to(self.compute_dtype)
-        mlp_out = self.mlp(x_norm)
-        x = residual + mlp_out
-
-        return x
-
-
-class Encoder(nn.Module):
-    """Transformer Encoder Stack using DenseGeneral."""
-
-    def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
-        super().__init__()
-        self.config = config
-        model_config = config.model
-        enc_config = config.model.encoder
-        self.compute_dtype = compute_dtype
-
-        self.embedding = nn.Embedding(
-            model_config.src_vocab_size,
-            enc_config.n_embd,
-            dtype=compute_dtype,
-        )
-        self.layers = nn.ModuleList([EncoderLayer(config, compute_dtype) for _ in range(enc_config.n_layer)])
-        self.norm = RMSNorm(
-            enc_config.n_embd,
-            eps=model_config.normalization_layer_epsilon,
-            dtype=torch.float32,
-        )
-
-    def forward(
-        self,
-        x_ids: torch.Tensor,
-        state: EncoderInferenceState,
-    ) -> torch.Tensor:
-        x = self.embedding(x_ids)
-
-        for layer in self.layers:
-            x = layer(x, state)
-
-        x = self.norm(x).to(self.compute_dtype)
-        return x
-
-
-class DecoderLayer(nn.Module):
-    """Transformer Decoder Layer using DenseGeneral."""
-
-    def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
-        super().__init__()
-        self.config = config
-        model_config = config.model
-        dec_config = config.model.decoder
-        enc_config = config.model.encoder
-        dec_embed_dim = dec_config.n_embd
-        enc_embed_dim = enc_config.n_embd
-        self.compute_dtype = compute_dtype
-
-        # Norms
-        self.pre_sa_norm = RMSNorm(
-            dec_embed_dim,
-            eps=model_config.normalization_layer_epsilon,
-            dtype=torch.float32,
-        )
-        self.pre_ca_norm = RMSNorm(
-            dec_embed_dim,
-            eps=model_config.normalization_layer_epsilon,
-            dtype=torch.float32,
-        )
-        self.pre_mlp_norm = RMSNorm(
-            dec_embed_dim,
-            eps=model_config.normalization_layer_epsilon,
-            dtype=torch.float32,
-        )
-
-        # Self-Attention (GQA) with Causal Masking
-        self.self_attention = Attention(
-            config,
-            q_embed_dim=dec_embed_dim,
-            kv_embed_dim=dec_embed_dim,
-            num_query_heads=dec_config.gqa_query_heads,
-            num_kv_heads=dec_config.kv_heads,
-            head_dim=dec_config.gqa_head_dim,
-            compute_dtype=compute_dtype,
-            is_cross_attn=False,
-            out_embed_dim=dec_embed_dim,
-        )
-        # Cross-Attention (MHA)
-        self.cross_attention = Attention(
-            config=config,
-            q_embed_dim=dec_embed_dim,
-            kv_embed_dim=enc_embed_dim,  # Note kv_embed_dim
-            num_query_heads=dec_config.cross_query_heads,
-            num_kv_heads=dec_config.cross_query_heads,
-            head_dim=dec_config.cross_head_dim,
-            compute_dtype=compute_dtype,
-            is_cross_attn=True,
-            out_embed_dim=dec_embed_dim,
-        )
-        # MLP
-        self.mlp = MlpBlock(
-            embed_dim=dec_embed_dim,
-            intermediate_dim=dec_config.n_hidden,
-            compute_dtype=compute_dtype,
-        )
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        state: DecoderInferenceState,
-        self_attn_cache: KVCache | None = None,
-        cross_attn_cache: KVCache | None = None,
-        prefill: bool = False,
-    ) -> torch.Tensor:
-        residual = x
-        x_norm = self.pre_sa_norm(x).to(self.compute_dtype)
-
-        sa_out = self.self_attention(
-            Xq=x_norm,  # (2, 1, D)
-            Xkv=x_norm,  # (2, 1, D)
-            q_positions=state.dec_positions,  # (2, 1)
-            kv_positions=state.dec_positions,  # (2, 1)
-            attn_mask=None,
-            cache=self_attn_cache,
-            prefill=prefill,
-            is_causal=prefill,
-        )
-
-        x = residual + sa_out
-
-        residual = x
-        x_norm = self.pre_ca_norm(x).to(self.compute_dtype)
-        ca_out = self.cross_attention(
-            Xq=x_norm,
-            Xkv=state.enc_out,
-            q_positions=state.dec_positions,
-            kv_positions=state.enc_positions,
-            attn_mask=state.dec_cross_attn_mask,
-            cache=cross_attn_cache,
-        )
-        x = residual + ca_out
-
-        residual = x
-        x_norm = self.pre_mlp_norm(x).to(self.compute_dtype)
-        mlp_out = self.mlp(x_norm)
-        x = residual + mlp_out
-
-        return x
-
-
-class Decoder(nn.Module):
-    """Transformer Decoder Stack using DenseGeneral."""
-
-    def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
-        super().__init__()
-        self.config = config
-        model_config = config.model
-        dec_config = config.model.decoder
-        data_config = config.data
-        self.num_channels = data_config.channels
-        self.num_layers = dec_config.n_layer
-
-        self.embeddings = nn.ModuleList(
-            [
-                nn.Embedding(model_config.tgt_vocab_size, dec_config.n_embd, dtype=compute_dtype)
-                for _ in range(self.num_channels)
-            ]
-        )
-        self.layers = nn.ModuleList(
-            [DecoderLayer(config=config, compute_dtype=compute_dtype) for _ in range(self.num_layers)]
-        )
-
-        self.norm = RMSNorm(
-            dec_config.n_embd,
-            eps=model_config.normalization_layer_epsilon,
-            dtype=torch.float32,
-        )
-
-        self.logits_dense = DenseGeneral(
-            in_shapes=(dec_config.n_embd,),
-            out_features=(self.num_channels, model_config.tgt_vocab_size),
-            axis=(-1,),
-            weight_dtype=compute_dtype,
-        )
-
-    def precompute_cross_attn_cache(
-        self,
-        enc_out: torch.Tensor,  # (B, S, E)
-        enc_positions: torch.Tensor,  # (B, S)
-    ) -> list[KVCache]:
-        """
-        Computes the Key and Value tensors for cross-attention for each layer from the encoder output.
-        """
-        per_layer_kv_cache: list[KVCache] = []
-
-        for layer in self.layers:
-            cross_attn_module = layer.cross_attention
-            k_proj = cross_attn_module.k_proj(enc_out)
-            v_proj = cross_attn_module.v_proj(enc_out)
-
-            k_proj = cross_attn_module.rotary_emb(k_proj, position=enc_positions)
-            k = k_proj.transpose(1, 2)
-            v = v_proj.transpose(1, 2)
-
-            per_layer_kv_cache.append(KVCache.from_kv(k, v))
-
-        return per_layer_kv_cache
-
-    def decode_step(
-        self,
-        tgt_ids_Bx1xC: torch.Tensor,  # [B, 1, C]
-        state: DecoderInferenceState,
-    ) -> torch.Tensor:
-        """
-        Performs a single decoding step, managing KV caches layer by layer.
-
-        Returns:
-            logits_Bx1xCxV: The output logits for the current step (B, 1, C, V), cast to float32.
-        """
-
-        x = None
-        for i in range(self.num_channels):
-            channel_tokens = tgt_ids_Bx1xC[..., i]
-            channel_embed = self.embeddings[i](channel_tokens)
-            x = channel_embed if x is None else x + channel_embed
-
-        for i, layer in enumerate(self.layers):
-            self_cache = state.self_attn_cache[i]
-            cross_cache = state.cross_attn_cache[i]
-            x = layer(
-                x,  # (2, 1, D)
-                state,
-                self_attn_cache=self_cache,
-                cross_attn_cache=cross_cache,
-            )
-
-        x = self.norm(x)
-        logits_Bx1xCxV = self.logits_dense(x)
-
-        return logits_Bx1xCxV.to(torch.float32)
-
-    def forward(self, tgt_ids_BxTxC: torch.Tensor, state: DecoderInferenceState) -> torch.Tensor:
-        """
-        Forward pass for the Decoder stack (used for prefill), managing KV caches.
-
-        Args:
-            tgt_ids_BxTxC: Target token IDs (B, T, C).
-            state: Decoder inference state carrying the encoder output, positions,
-                cross-attention mask, and per-layer self/cross-attention KV caches.
-
-        Returns:
-            logits_BxTxCxV: The final output logits (B, T, C, V), cast to float32.
-        """
-        _, _, num_channels_in = tgt_ids_BxTxC.shape
-        assert num_channels_in == self.num_channels, "Input channels mismatch"
-
-        # Embeddings
-        x = None
-        for i in range(self.num_channels):
-            channel_tokens = tgt_ids_BxTxC[..., i]
-            channel_embed = self.embeddings[i](channel_tokens)
-            x = channel_embed if x is None else x + channel_embed
-
-        for i, layer in enumerate(self.layers):
-            self_cache = state.self_attn_cache[i]
-            cross_cache = state.cross_attn_cache[i]
-            x = layer(x, state, self_attn_cache=self_cache, cross_attn_cache=cross_cache, prefill=True)
-
-        # Final Norm
-        x = self.norm(x)
-        logits_BxTxCxV = self.logits_dense(x)
-
-        return logits_BxTxCxV.to(torch.float32)
-
-
-class DiaModel(
-    nn.Module,
-    PyTorchModelHubMixin,
-    repo_url="https://github.com/nari-labs/dia",
-    pipeline_tag="text-to-speech",
-    license="apache-2.0",
-    coders={
-        DiaConfig: (
-            lambda x: x.model_dump(),
-            lambda data: DiaConfig.model_validate(data),
-        ),
-    },
-):
-    """PyTorch Dia Model using DenseGeneral."""
-
-    def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
-        super().__init__()
-        self.config = config
-        self.encoder = Encoder(config, compute_dtype)
-        self.decoder = Decoder(config, compute_dtype)
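DenseGeneral is the load-bearing abstraction in this file: every projection is a tensordot against a Jax-layout kernel rather than an nn.Linear. A small shape sketch of what q_proj- and o_proj-style layers do (illustrative only; the kernel is allocated with torch.empty, so we initialize it explicitly here, and the toy sizes are made up):

import torch

from dia.layers import DenseGeneral

B, T, D, N, H = 2, 5, 64, 8, 16  # toy sizes

# q_proj-style: contract the last axis, fan out to (heads, head_dim).
q_proj = DenseGeneral(in_shapes=(D,), out_features=(N, H), axis=(-1,), weight_dtype=torch.float32)
torch.nn.init.normal_(q_proj.weight, std=0.02)  # DenseGeneral itself leaves the kernel uninitialized
x = torch.randn(B, T, D)
assert q_proj(x).shape == (B, T, N, H)

# o_proj-style: contract (heads, head_dim) back down to the model dim.
o_proj = DenseGeneral(in_shapes=(N, H), out_features=(D,), axis=(-2, -1), weight_dtype=torch.float32)
torch.nn.init.normal_(o_proj.weight, std=0.02)
assert o_proj(q_proj(x)).shape == (B, T, D)

# For the single-axis case this matches a bias-free nn.Linear whose weight
# is the (D, N, H) kernel flattened to (D, N*H) and transposed.
linear = torch.nn.Linear(D, N * H, bias=False)
linear.weight.data = q_proj.weight.reshape(D, N * H).T.contiguous()
assert torch.allclose(linear(x).reshape(B, T, N, H), q_proj(x), atol=1e-5)

Keeping the kernel in Jax layout is what lets checkpoints converted from the original Flax weights be copied in without transposition.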
dia/model.py DELETED
@@ -1,455 +0,0 @@
-import time
-from enum import Enum
-
-import dac
-import numpy as np
-import torch
-import torchaudio
-
-from .audio import apply_audio_delay, build_delay_indices, build_revert_indices, decode, revert_audio_delay
-from .config import DiaConfig
-from .layers import DiaModel
-from .state import DecoderInferenceState, DecoderOutput, EncoderInferenceState
-
-
-DEFAULT_SAMPLE_RATE = 44100
-
-
-def _get_default_device():
-    if torch.cuda.is_available():
-        return torch.device("cuda")
-    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-        return torch.device("mps")
-    return torch.device("cpu")
-
-
-def _sample_next_token(
-    logits_BCxV: torch.Tensor,
-    temperature: float,
-    top_p: float,
-    cfg_filter_top_k: int | None = None,
-) -> torch.Tensor:
-    if temperature == 0.0:
-        return torch.argmax(logits_BCxV, dim=-1)
-
-    logits_BCxV = logits_BCxV / temperature
-    if cfg_filter_top_k is not None:
-        _, top_k_indices_BCxV = torch.topk(logits_BCxV, k=cfg_filter_top_k, dim=-1)
-        mask = torch.ones_like(logits_BCxV, dtype=torch.bool)
-        mask.scatter_(dim=-1, index=top_k_indices_BCxV, value=False)
-        logits_BCxV = logits_BCxV.masked_fill(mask, -torch.inf)
-
-    if top_p < 1.0:
-        probs_BCxV = torch.softmax(logits_BCxV, dim=-1)
-        sorted_probs_BCxV, sorted_indices_BCxV = torch.sort(probs_BCxV, dim=-1, descending=True)
-        cumulative_probs_BCxV = torch.cumsum(sorted_probs_BCxV, dim=-1)
-
-        sorted_indices_to_remove_BCxV = cumulative_probs_BCxV > top_p
-        sorted_indices_to_remove_BCxV[..., 1:] = sorted_indices_to_remove_BCxV[..., :-1].clone()
-        sorted_indices_to_remove_BCxV[..., 0] = 0
-
-        indices_to_remove_BCxV = torch.zeros_like(sorted_indices_to_remove_BCxV)
-        indices_to_remove_BCxV.scatter_(dim=-1, index=sorted_indices_BCxV, src=sorted_indices_to_remove_BCxV)
-        logits_BCxV = logits_BCxV.masked_fill(indices_to_remove_BCxV, -torch.inf)
-
-    final_probs_BCxV = torch.softmax(logits_BCxV, dim=-1)
-
-    sampled_indices_BC = torch.multinomial(final_probs_BCxV, num_samples=1)
-    sampled_indices_C = sampled_indices_BC.squeeze(-1)
-    return sampled_indices_C
-
-
-class ComputeDtype(str, Enum):
-    FLOAT32 = "float32"
-    FLOAT16 = "float16"
-    BFLOAT16 = "bfloat16"
-
-    def to_dtype(self) -> torch.dtype:
-        if self == ComputeDtype.FLOAT32:
-            return torch.float32
-        elif self == ComputeDtype.FLOAT16:
-            return torch.float16
-        elif self == ComputeDtype.BFLOAT16:
-            return torch.bfloat16
-        else:
-            raise ValueError(f"Unsupported compute dtype: {self}")
-
-
-class Dia:
-    def __init__(
-        self,
-        config: DiaConfig,
-        compute_dtype: str | ComputeDtype = ComputeDtype.FLOAT32,
-        device: torch.device | None = None,
-    ):
-        """Initializes the Dia model.
-
-        Args:
-            config: The configuration object for the model.
-            compute_dtype: The computation dtype to use.
-            device: The device to load the model onto. If None, will automatically select the best available device.
-
-        Raises:
-            RuntimeError: If there is an error loading the DAC model.
-        """
-        super().__init__()
-        self.config = config
-        self.device = device if device is not None else _get_default_device()
-        if isinstance(compute_dtype, str):
-            compute_dtype = ComputeDtype(compute_dtype)
-        self.compute_dtype = compute_dtype.to_dtype()
-        self.model = DiaModel(config, self.compute_dtype)
-        self.dac_model = None
-
-    @classmethod
-    def from_local(
-        cls,
-        config_path: str,
-        checkpoint_path: str,
-        compute_dtype: str | ComputeDtype = ComputeDtype.FLOAT32,
-        device: torch.device | None = None,
-    ) -> "Dia":
-        """Loads the Dia model from local configuration and checkpoint files.
-
-        Args:
-            config_path: Path to the configuration JSON file.
-            checkpoint_path: Path to the model checkpoint (.pth) file.
-            compute_dtype: The computation dtype to use.
-            device: The device to load the model onto. If None, will automatically select the best available device.
-
-        Returns:
-            An instance of the Dia model loaded with weights and set to eval mode.
-
-        Raises:
-            FileNotFoundError: If the config or checkpoint file is not found.
-            RuntimeError: If there is an error loading the checkpoint.
-        """
-        config = DiaConfig.load(config_path)
-        if config is None:
-            raise FileNotFoundError(f"Config file not found at {config_path}")
-
-        dia = cls(config, compute_dtype, device)
-
-        try:
-            state_dict = torch.load(checkpoint_path, map_location=dia.device)
-            dia.model.load_state_dict(state_dict)
-        except FileNotFoundError:
-            raise FileNotFoundError(f"Checkpoint file not found at {checkpoint_path}")
-        except Exception as e:
-            raise RuntimeError(f"Error loading checkpoint from {checkpoint_path}") from e
-
-        dia.model.to(dia.device)
-        dia.model.eval()
-        dia._load_dac_model()
-        return dia
-
-    @classmethod
-    def from_pretrained(
-        cls,
-        model_name: str = "nari-labs/Dia-1.6B",
-        compute_dtype: str | ComputeDtype = ComputeDtype.FLOAT32,
-        device: torch.device | None = None,
-    ) -> "Dia":
-        """Loads the Dia model from a Hugging Face Hub repository.
-
-        Downloads the configuration and checkpoint files from the specified
-        repository ID and then loads the model.
-
-        Args:
-            model_name: The Hugging Face Hub repository ID (e.g., "nari-labs/Dia-1.6B").
-            compute_dtype: The computation dtype to use.
-            device: The device to load the model onto. If None, will automatically select the best available device.
-
-        Returns:
-            An instance of the Dia model loaded with weights and set to eval mode.
-
-        Raises:
-            FileNotFoundError: If config or checkpoint download/loading fails.
-            RuntimeError: If there is an error loading the checkpoint.
-        """
-        if isinstance(compute_dtype, str):
-            compute_dtype = ComputeDtype(compute_dtype)
-        loaded_model = DiaModel.from_pretrained(model_name, compute_dtype=compute_dtype.to_dtype())
-        config = loaded_model.config
-        dia = cls(config, compute_dtype, device)
-
-        dia.model = loaded_model
-        dia.model.to(dia.device)
-        dia.model.eval()
-        dia._load_dac_model()
-        return dia
-
-    def _load_dac_model(self):
-        try:
-            dac_model_path = dac.utils.download()
-            dac_model = dac.DAC.load(dac_model_path).to(self.device)
-        except Exception as e:
-            raise RuntimeError("Failed to load DAC model") from e
-        self.dac_model = dac_model
-
-    def _prepare_text_input(self, text: str) -> torch.Tensor:
-        """Encodes the text prompt as byte tokens and pads it to the configured length."""
-        text_pad_value = self.config.data.text_pad_value
-        max_len = self.config.data.text_length
-
-        byte_text = text.encode("utf-8")
-        replaced_bytes = byte_text.replace(b"[S1]", b"\x01").replace(b"[S2]", b"\x02")
-        text_tokens = list(replaced_bytes)
-
-        current_len = len(text_tokens)
-        padding_needed = max_len - current_len
-        if padding_needed <= 0:
-            text_tokens = text_tokens[:max_len]
-            padded_text_np = np.array(text_tokens, dtype=np.uint8)
-        else:
-            padded_text_np = np.pad(
-                text_tokens,
-                (0, padding_needed),
-                mode="constant",
-                constant_values=text_pad_value,
-            ).astype(np.uint8)
-
-        src_tokens = torch.from_numpy(padded_text_np).to(torch.long).to(self.device).unsqueeze(0)  # [1, S]
-        return src_tokens
-
-    def _prepare_audio_prompt(self, audio_prompt: torch.Tensor | None) -> tuple[torch.Tensor, int]:
-        num_channels = self.config.data.channels
-        audio_bos_value = self.config.data.audio_bos_value
-        audio_pad_value = self.config.data.audio_pad_value
-        delay_pattern = self.config.data.delay_pattern
-        max_delay_pattern = max(delay_pattern)
-
-        prefill = torch.full(
-            (1, num_channels),
-            fill_value=audio_bos_value,
-            dtype=torch.int,
-            device=self.device,
-        )
-
-        prefill_step = 1
-
-        if audio_prompt is not None:
-            prefill_step += audio_prompt.shape[0]
-            prefill = torch.cat([prefill, audio_prompt], dim=0)
-
-        delay_pad_tensor = torch.full(
-            (max_delay_pattern, num_channels), fill_value=-1, dtype=torch.int, device=self.device
-        )
-        prefill = torch.cat([prefill, delay_pad_tensor], dim=0)
-
-        delay_precomp = build_delay_indices(
-            B=1,
-            T=prefill.shape[0],
-            C=num_channels,
-            delay_pattern=delay_pattern,
-        )
-
-        prefill = apply_audio_delay(
-            audio_BxTxC=prefill.unsqueeze(0),
-            pad_value=audio_pad_value,
-            bos_value=audio_bos_value,
-            precomp=delay_precomp,
-        ).squeeze(0)
-
-        return prefill, prefill_step
-
-    def _prepare_generation(self, text: str, audio_prompt: str | torch.Tensor | None, verbose: bool):
-        enc_input_cond = self._prepare_text_input(text)
-        enc_input_uncond = torch.zeros_like(enc_input_cond)
-        enc_input = torch.cat([enc_input_uncond, enc_input_cond], dim=0)
-
-        if isinstance(audio_prompt, str):
-            audio_prompt = self.load_audio(audio_prompt)
-        prefill, prefill_step = self._prepare_audio_prompt(audio_prompt)
-
-        if verbose:
-            print("generate: data loaded")
-
-        enc_state = EncoderInferenceState.new(self.config, enc_input_cond)
-        encoder_out = self.model.encoder(enc_input, enc_state)
-
-        dec_cross_attn_cache = self.model.decoder.precompute_cross_attn_cache(encoder_out, enc_state.positions)
-        dec_state = DecoderInferenceState.new(
-            self.config, enc_state, encoder_out, dec_cross_attn_cache, self.compute_dtype
-        )
-        dec_output = DecoderOutput.new(self.config, self.device)
-        dec_output.prefill(prefill, prefill_step)
-
-        dec_step = prefill_step - 1
-        if dec_step > 0:
-            dec_state.prepare_step(0, dec_step)
-            tokens_BxTxC = dec_output.get_tokens_at(0, dec_step).unsqueeze(0).expand(2, -1, -1)
-            self.model.decoder.forward(tokens_BxTxC, dec_state)
-
-        return dec_state, dec_output
-
-    def _decoder_step(
-        self,
-        tokens_Bx1xC: torch.Tensor,
-        dec_state: DecoderInferenceState,
-        cfg_scale: float,
-        temperature: float,
-        top_p: float,
-        cfg_filter_top_k: int,
-    ) -> torch.Tensor:
-        audio_eos_value = self.config.data.audio_eos_value
-        logits_Bx1xCxV = self.model.decoder.decode_step(tokens_Bx1xC, dec_state)
-
-        logits_last_BxCxV = logits_Bx1xCxV[:, -1, :, :]
-        uncond_logits_CxV = logits_last_BxCxV[0, :, :]
-        cond_logits_CxV = logits_last_BxCxV[1, :, :]
-
-        logits_CxV = cond_logits_CxV + cfg_scale * (cond_logits_CxV - uncond_logits_CxV)
-        logits_CxV[:, audio_eos_value + 1 :] = -torch.inf
-        logits_CxV[1:, audio_eos_value:] = -torch.inf
-
-        pred_C = _sample_next_token(
-            logits_CxV.float(),
-            temperature=temperature,
-            top_p=top_p,
-            cfg_filter_top_k=cfg_filter_top_k,
-        )
-        return pred_C
-
-    def _generate_output(self, generated_codes: torch.Tensor) -> np.ndarray:
-        num_channels = self.config.data.channels
-        seq_length = generated_codes.shape[0]
-        delay_pattern = self.config.data.delay_pattern
-        audio_pad_value = self.config.data.audio_pad_value
-        max_delay_pattern = max(delay_pattern)
-
-        revert_precomp = build_revert_indices(
-            B=1,
-            T=seq_length,
-            C=num_channels,
-            delay_pattern=delay_pattern,
-        )
-
-        codebook = revert_audio_delay(
-            audio_BxTxC=generated_codes.unsqueeze(0),
-            pad_value=audio_pad_value,
-            precomp=revert_precomp,
-            T=seq_length,
-        )[:, :-max_delay_pattern, :]
-
-        min_valid_index = 0
-        max_valid_index = 1023
-        invalid_mask = (codebook < min_valid_index) | (codebook > max_valid_index)
-        codebook[invalid_mask] = 0
-
-        audio = decode(self.dac_model, codebook.transpose(1, 2))
-
-        return audio.squeeze().cpu().numpy()
-
-    def load_audio(self, audio_path: str) -> torch.Tensor:
-        audio, sr = torchaudio.load(audio_path, channels_first=True)  # C, T
-        if sr != DEFAULT_SAMPLE_RATE:
-            audio = torchaudio.functional.resample(audio, sr, DEFAULT_SAMPLE_RATE)
-        audio = audio.to(self.device).unsqueeze(0)  # 1, C, T
-        audio_data = self.dac_model.preprocess(audio, DEFAULT_SAMPLE_RATE)
-        _, encoded_frame, _, _, _ = self.dac_model.encode(audio_data)  # 1, C, T
-        return encoded_frame.squeeze(0).transpose(0, 1)
-
-    def save_audio(self, path: str, audio: np.ndarray):
-        import soundfile as sf
-
-        sf.write(path, audio, DEFAULT_SAMPLE_RATE)
-
-    @torch.inference_mode()
-    def generate(
-        self,
-        text: str,
-        max_tokens: int | None = None,
-        cfg_scale: float = 3.0,
-        temperature: float = 1.3,
-        top_p: float = 0.95,
-        use_torch_compile: bool = False,
-        cfg_filter_top_k: int = 35,
-        audio_prompt: str | torch.Tensor | None = None,
-        audio_prompt_path: str | None = None,
-        use_cfg_filter: bool | None = None,
-        verbose: bool = False,
-    ) -> np.ndarray:
-        audio_eos_value = self.config.data.audio_eos_value
-        audio_pad_value = self.config.data.audio_pad_value
-        delay_pattern = self.config.data.delay_pattern
-        max_tokens = self.config.data.audio_length if max_tokens is None else max_tokens
-        max_delay_pattern = max(delay_pattern)
-        self.model.eval()
-
-        if audio_prompt_path:
-            print("Warning: audio_prompt_path is deprecated. Use audio_prompt instead.")
-            audio_prompt = audio_prompt_path
-        if use_cfg_filter is not None:
-            print("Warning: use_cfg_filter is deprecated.")
-
-        if verbose:
-            total_start_time = time.time()
-
-        dec_state, dec_output = self._prepare_generation(text, audio_prompt, verbose)
-        dec_step = dec_output.prefill_step - 1
-
-        bos_countdown = max_delay_pattern
-        eos_detected = False
-        eos_countdown = -1
-
-        if use_torch_compile:
-            step_fn = torch.compile(self._decoder_step, mode="default")
-        else:
-            step_fn = self._decoder_step
-
-        if verbose:
-            print("generate: starting generation loop")
-            if use_torch_compile:
-                print("generate: with use_torch_compile=True, the first step will take a while")
-            start_time = time.time()
-
-        while dec_step < max_tokens:
-            dec_state.prepare_step(dec_step)
-            tokens_Bx1xC = dec_output.get_tokens_at(dec_step).unsqueeze(0).expand(2, -1, -1)
-            pred_C = step_fn(
-                tokens_Bx1xC,
-                dec_state,
-                cfg_scale,
-                temperature,
-                top_p,
-                cfg_filter_top_k,
-            )
-
-            if (not eos_detected and pred_C[0] == audio_eos_value) or dec_step == max_tokens - max_delay_pattern - 1:
-                eos_detected = True
-                eos_countdown = max_delay_pattern
-
-            if eos_countdown > 0:
-                step_after_eos = max_delay_pattern - eos_countdown
-                for i, d in enumerate(delay_pattern):
-                    if step_after_eos == d:
-                        pred_C[i] = audio_eos_value
-                    elif step_after_eos > d:
-                        pred_C[i] = audio_pad_value
-                eos_countdown -= 1
-
-            bos_countdown = max(0, bos_countdown - 1)
-            dec_output.update_one(pred_C, dec_step + 1, bos_countdown > 0)
-
-            if eos_countdown == 0:
-                break
-
-            dec_step += 1
-            if verbose and dec_step % 86 == 0:
-                duration = time.time() - start_time
-                print(
-                    f"generate step {dec_step}: speed={86 / duration:.3f} tokens/s, realtime factor={1 / duration:.3f}x"
-                )
-                start_time = time.time()
-
-        if dec_output.prefill_step >= dec_step + 1:
-            print("Warning: Nothing generated")
-            return None
-
-        generated_codes = dec_output.generated_tokens[dec_output.prefill_step : dec_step + 1, :]
-
-        if verbose:
-            total_step = dec_step + 1 - dec_output.prefill_step
-            total_duration = time.time() - total_start_time
-            print(f"generate: total step={total_step}, total duration={total_duration:.3f}s")
-
-        return self._generate_output(generated_codes)
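For context on what the removed entry point looked like in use, a minimal sketch against the Dia API defined above (the prompt text and output path are placeholders, and float16 assumes a GPU; it still assumes the pre-refactor dia.model module):

from dia.model import Dia

# Loads DiaModel weights from the Hub plus the DAC codec; picks CUDA/MPS/CPU automatically.
model = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="float16")

# [S1]/[S2] speaker tags are mapped to byte tokens 0x01/0x02 by _prepare_text_input.
audio = model.generate(
    text="[S1] Hello there. [S2] Hi! This module is being refactored.",
    cfg_scale=3.0,        # classifier-free guidance strength
    temperature=1.3,
    top_p=0.95,
    cfg_filter_top_k=35,  # top-k filter applied to the CFG-combined logits
    verbose=True,
)
model.save_audio("out.wav", audio)  # 44.1 kHz waveform written via soundfile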
dia/state.py DELETED
@@ -1,207 +0,0 @@
-from dataclasses import dataclass
-
-import torch
-
-from .config import DiaConfig
-
-
-def create_attn_mask(
-    q_padding_mask_1d: torch.Tensor,
-    k_padding_mask_1d: torch.Tensor,
-    device: torch.device,
-    is_causal: bool = False,
-) -> torch.Tensor:
-    """
-    Creates the attention mask (self or cross) mimicking JAX segment ID logic.
-    """
-    B1, Tq = q_padding_mask_1d.shape
-    B2, Tk = k_padding_mask_1d.shape
-    assert B1 == B2, "Query and key batch dimensions must match"
-
-    p_mask_q = q_padding_mask_1d.unsqueeze(2)  # Shape [B, Tq, 1]
-    p_mask_k = k_padding_mask_1d.unsqueeze(1)  # Shape [B, 1, Tk]
-
-    # Condition A: Non-padding query attends to non-padding key
-    non_pad_attends_non_pad = p_mask_q & p_mask_k  # Shape [B, Tq, Tk]
-
-    # Condition B: Padding query attends to padding key
-    pad_attends_pad = (~p_mask_q) & (~p_mask_k)  # Shape [B, Tq, Tk]
-
-    # Combine: True if padding status is compatible (both non-pad OR both pad)
-    mask = non_pad_attends_non_pad | pad_attends_pad  # Shape [B, Tq, Tk]
-
-    if is_causal:
-        assert Tq == Tk, "Causal mask requires query and key sequence lengths to be equal"
-        causal_mask_2d = torch.tril(torch.ones((Tq, Tk), dtype=torch.bool, device=device))  # Shape [Tq, Tk]
-        causal_mask = mask & causal_mask_2d  # Shape [B, Tq, Tk]
-        return causal_mask.unsqueeze(1)  # Shape [B, 1, Tq, Tk]
-    else:
-        return mask.unsqueeze(1)  # Shape [B, 1, Tq, Tk]
-
-
-@dataclass
-class EncoderInferenceState:
-    """Parameters specifically for encoder inference."""
-
-    max_seq_len: int
-    device: torch.device
-    positions: torch.Tensor
-    padding_mask: torch.Tensor
-    attn_mask: torch.Tensor
-
-    @classmethod
-    def new(cls, config: DiaConfig, cond_src: torch.Tensor) -> "EncoderInferenceState":
-        """Creates an EncoderInferenceState from a DiaConfig and the conditional source tokens."""
-        device = cond_src.device
-
-        positions = (
-            torch.arange(config.data.text_length, dtype=torch.float32, device=device).unsqueeze(0).expand(2, -1)
-        )
-        padding_mask = (cond_src != config.data.text_pad_value).to(device).expand(2, -1)
-        attn_mask = create_attn_mask(padding_mask, padding_mask, device, is_causal=False)
-
-        return cls(
-            max_seq_len=config.data.text_length,
-            device=device,
-            positions=positions,
-            padding_mask=padding_mask,
-            attn_mask=attn_mask,
-        )
-
-
-class KVCache:
-    def __init__(
-        self,
-        num_heads: int,
-        max_len: int,
-        head_dim: int,
-        dtype: torch.dtype,
-        device: torch.device,
-        k: torch.Tensor | None = None,
-        v: torch.Tensor | None = None,
-    ):
-        self.k = torch.zeros((2, num_heads, max_len, head_dim), dtype=dtype, device=device) if k is None else k
-        self.v = torch.zeros((2, num_heads, max_len, head_dim), dtype=dtype, device=device) if v is None else v
-        self.current_idx = torch.tensor(0)
-
-    @classmethod
-    def from_kv(cls, k: torch.Tensor, v: torch.Tensor) -> "KVCache":
-        return cls(
-            num_heads=k.shape[1],
-            max_len=k.shape[2],
-            head_dim=k.shape[3],
-            dtype=k.dtype,
-            device=k.device,
-            k=k,
-            v=v,
-        )
-
-    def update(self, k: torch.Tensor, v: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-        self.k[:, :, self.current_idx : self.current_idx + 1, :] = k
-        self.v[:, :, self.current_idx : self.current_idx + 1, :] = v
-        self.current_idx += 1
-        return self.k[:, :, : self.current_idx, :], self.v[:, :, : self.current_idx, :]
-
-    def prefill(self, k: torch.Tensor, v: torch.Tensor) -> None:
-        prefill_len = k.shape[2]
-        self.k[:, :, :prefill_len, :] = k
-        self.v[:, :, :prefill_len, :] = v
-        self.current_idx = prefill_len - 1
-
-
-@dataclass
-class DecoderInferenceState:
-    """Parameters specifically for decoder inference."""
-
-    device: torch.device
-    dtype: torch.dtype
-    enc_out: torch.Tensor
-    enc_positions: torch.Tensor
-    dec_positions: torch.Tensor
-    dec_cross_attn_mask: torch.Tensor
-    self_attn_cache: list[KVCache]
-    cross_attn_cache: list[KVCache]
-
-    @classmethod
-    def new(
-        cls,
-        config: DiaConfig,
-        enc_state: EncoderInferenceState,
-        enc_out: torch.Tensor,
-        dec_cross_attn_cache: list[KVCache],
-        compute_dtype: torch.dtype,
-    ) -> "DecoderInferenceState":
-        """Creates a DecoderInferenceState from a DiaConfig and the encoder state."""
-        device = enc_out.device
-        max_audio_len = config.data.audio_length
-
-        dec_positions = torch.full((2, 1), fill_value=0, dtype=torch.long, device=device)
-        tgt_padding_mask = torch.ones((2, 1), dtype=torch.bool, device=device)
-        dec_cross_attn_mask = create_attn_mask(tgt_padding_mask, enc_state.padding_mask, device, is_causal=False)
-
-        self_attn_cache = [
-            KVCache(
-                config.model.decoder.kv_heads,
-                max_audio_len,
-                config.model.decoder.gqa_head_dim,
-                compute_dtype,
-                device,
-            )
-            for _ in range(config.model.decoder.n_layer)
-        ]
-
-        return cls(
-            device=device,
-            dtype=compute_dtype,
-            enc_out=enc_out,
-            enc_positions=enc_state.positions,
-            dec_positions=dec_positions,
-            dec_cross_attn_mask=dec_cross_attn_mask,
-            self_attn_cache=self_attn_cache,
-            cross_attn_cache=dec_cross_attn_cache,
-        )
-
-    def prepare_step(self, step_from: int, step_to: int | None = None) -> None:
-        if step_to is None:
-            step_to = step_from + 1
-        self.dec_positions = (
-            torch.arange(step_from, step_to, dtype=torch.float32, device=self.device).unsqueeze(0).expand(2, -1)
-        )
-
-
-@dataclass
-class DecoderOutput:
-    generated_tokens: torch.Tensor
-    prefill_step: int
-
-    @classmethod
-    def new(cls, config: DiaConfig, device: torch.device) -> "DecoderOutput":
-        max_audio_len = config.data.audio_length
-        return cls(
-            generated_tokens=torch.full(
-                (max_audio_len, config.data.channels),
-                fill_value=-1,
-                dtype=torch.int,
-                device=device,
-            ),
-            prefill_step=0,
-        )
-
-    def get_tokens_at(self, step_from: int, step_to: int | None = None) -> torch.Tensor:
-        if step_to is None:
-            step_to = step_from + 1
193
- return self.generated_tokens[step_from:step_to, :]
194
-
195
- def update_one(self, dec_out: torch.Tensor, step: int, apply_mask: bool = False):
196
- if apply_mask:
197
- mask = self.generated_tokens[step : step + 1, :] == -1
198
- self.generated_tokens[step : step + 1, :] = torch.where(
199
- mask, dec_out, self.generated_tokens[step : step + 1, :]
200
- )
201
- else:
202
- self.generated_tokens[step : step + 1, :] = dec_out
203
-
204
- def prefill(self, dec_out: torch.Tensor, prefill_step: int):
205
- length = dec_out.shape[0]
206
- self.generated_tokens[0:length, :] = dec_out
207
- self.prefill_step = prefill_step
dia_app_gradio.py DELETED
@@ -1,378 +0,0 @@
1
- import tempfile
2
- import time
3
- from pathlib import Path
4
- from typing import Optional, Tuple
5
- import spaces
6
-
7
- import gradio as gr
8
- import numpy as np
9
- import soundfile as sf
10
- import torch
11
-
12
- from dia.model import Dia
13
-
14
-
15
- # Load Nari model and config
16
- print("Loading Nari model...")
17
- try:
18
- # Use the function from inference.py
19
- model = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="float32")
20
- except Exception as e:
21
- print(f"Error loading Nari model: {e}")
22
- raise
23
-
24
-
25
- @spaces.GPU
26
- def run_inference(
27
- text_input: str,
28
- audio_prompt_input: Optional[Tuple[int, np.ndarray]],
29
- max_new_tokens: int,
30
- cfg_scale: float,
31
- temperature: float,
32
- top_p: float,
33
- cfg_filter_top_k: int,
34
- speed_factor: float,
35
- ):
36
- """
37
- Runs Nari inference using the globally loaded model and provided inputs.
38
- Uses temporary files for text and audio prompt compatibility with inference.generate.
39
- """
40
- # global model, device # Access global model, config, device
41
-
42
- if not text_input or text_input.isspace():
43
- raise gr.Error("Text input cannot be empty.")
44
-
45
- temp_txt_file_path = None
46
- temp_audio_prompt_path = None
47
- output_audio = (44100, np.zeros(1, dtype=np.float32))
48
-
49
- try:
50
- prompt_path_for_generate = None
51
- if audio_prompt_input is not None:
52
- sr, audio_data = audio_prompt_input
53
- # Check if audio_data is valid
54
- if (
55
- audio_data is None or audio_data.size == 0 or audio_data.max() == 0
56
- ): # Check for silence/empty
57
- gr.Warning("Audio prompt seems empty or silent, ignoring prompt.")
58
- else:
59
- # Save prompt audio to a temporary WAV file
60
- with tempfile.NamedTemporaryFile(
61
- mode="wb", suffix=".wav", delete=False
62
- ) as f_audio:
63
- temp_audio_prompt_path = f_audio.name # Store path for cleanup
64
-
65
- # Basic audio preprocessing for consistency
66
- # Convert to float32 in [-1, 1] range if integer type
67
- if np.issubdtype(audio_data.dtype, np.integer):
68
- max_val = np.iinfo(audio_data.dtype).max
69
- audio_data = audio_data.astype(np.float32) / max_val
70
- elif not np.issubdtype(audio_data.dtype, np.floating):
71
- gr.Warning(
72
- f"Unsupported audio prompt dtype {audio_data.dtype}, attempting conversion."
73
- )
74
- # Attempt conversion, might fail for complex types
75
- try:
76
- audio_data = audio_data.astype(np.float32)
77
- except Exception as conv_e:
78
- raise gr.Error(
79
- f"Failed to convert audio prompt to float32: {conv_e}"
80
- )
81
-
82
- # Ensure mono (average channels if stereo)
83
- if audio_data.ndim > 1:
84
- if audio_data.shape[0] == 2: # Assume (2, N)
85
- audio_data = np.mean(audio_data, axis=0)
86
- elif audio_data.shape[1] == 2: # Assume (N, 2)
87
- audio_data = np.mean(audio_data, axis=1)
88
- else:
89
- gr.Warning(
90
- f"Audio prompt has unexpected shape {audio_data.shape}, taking first channel/axis."
91
- )
92
- audio_data = (
93
- audio_data[0]
94
- if audio_data.shape[0] < audio_data.shape[1]
95
- else audio_data[:, 0]
96
- )
97
- audio_data = np.ascontiguousarray(
98
- audio_data
99
- ) # Ensure contiguous after slicing/mean
100
-
101
- # Write using soundfile
102
- try:
103
- sf.write(
104
- temp_audio_prompt_path, audio_data, sr, subtype="FLOAT"
105
- ) # Explicitly use FLOAT subtype
106
- prompt_path_for_generate = temp_audio_prompt_path
107
- print(
108
- f"Created temporary audio prompt file: {temp_audio_prompt_path} (orig sr: {sr})"
109
- )
110
- except Exception as write_e:
111
- print(f"Error writing temporary audio file: {write_e}")
112
- raise gr.Error(f"Failed to save audio prompt: {write_e}")
113
-
114
- # 3. Run Generation
115
-
116
- start_time = time.time()
117
-
118
- # Use torch.inference_mode() context manager for the generation call
119
- with torch.inference_mode():
120
- output_audio_np = model.generate(
121
- text_input,
122
- max_tokens=max_new_tokens,
123
- cfg_scale=cfg_scale,
124
- temperature=temperature,
125
- top_p=top_p,
126
- cfg_filter_top_k=cfg_filter_top_k, # Pass the value here
127
- use_torch_compile=False, # Keep False for Gradio stability
128
- audio_prompt=prompt_path_for_generate,
129
- )
130
-
131
- end_time = time.time()
132
- print(f"Generation finished in {end_time - start_time:.2f} seconds.")
133
-
134
- # 4. Convert Codes to Audio
135
- if output_audio_np is not None:
136
- # Get sample rate from the loaded DAC model
137
- output_sr = 44100
138
-
139
- # --- Slow down audio ---
140
- original_len = len(output_audio_np)
141
- # Ensure speed_factor is positive and not excessively small/large to avoid issues
142
- speed_factor = max(0.1, min(speed_factor, 5.0))
143
- target_len = int(
144
- original_len / speed_factor
145
- ) # Target length based on speed_factor
146
- if (
147
- target_len != original_len and target_len > 0
148
- ): # Only interpolate if length changes and is valid
149
- x_original = np.arange(original_len)
150
- x_resampled = np.linspace(0, original_len - 1, target_len)
151
- resampled_audio_np = np.interp(x_resampled, x_original, output_audio_np)
152
- output_audio = (
153
- output_sr,
154
- resampled_audio_np.astype(np.float32),
155
- ) # Use resampled audio
156
- print(
157
- f"Resampled audio from {original_len} to {target_len} samples for {speed_factor:.2f}x speed."
158
- )
159
- else:
160
- output_audio = (
161
- output_sr,
162
- output_audio_np,
163
- ) # Keep original if calculation fails or no change
164
- print(f"Skipping audio speed adjustment (factor: {speed_factor:.2f}).")
165
- # --- End slowdown ---
166
-
167
- print(
168
- f"Audio conversion successful. Final shape: {output_audio[1].shape}, Sample Rate: {output_sr}"
169
- )
170
-
171
- # Explicitly convert to int16 to prevent Gradio warning
172
- if (
173
- output_audio[1].dtype == np.float32
174
- or output_audio[1].dtype == np.float64
175
- ):
176
- audio_for_gradio = np.clip(output_audio[1], -1.0, 1.0)
177
- audio_for_gradio = (audio_for_gradio * 32767).astype(np.int16)
178
- output_audio = (output_sr, audio_for_gradio)
179
- print("Converted audio to int16 for Gradio output.")
180
-
181
- else:
182
- print("\nGeneration finished, but no valid tokens were produced.")
183
- # Return default silence
184
- gr.Warning("Generation produced no output.")
185
-
186
- except Exception as e:
187
- print(f"Error during inference: {e}")
188
- import traceback
189
-
190
- traceback.print_exc()
191
- # Re-raise as Gradio error to display nicely in the UI
192
- raise gr.Error(f"Inference failed: {e}")
193
-
194
- finally:
195
- # 5. Cleanup Temporary Files defensively
196
- if temp_txt_file_path and Path(temp_txt_file_path).exists():
197
- try:
198
- Path(temp_txt_file_path).unlink()
199
- print(f"Deleted temporary text file: {temp_txt_file_path}")
200
- except OSError as e:
201
- print(
202
- f"Warning: Error deleting temporary text file {temp_txt_file_path}: {e}"
203
- )
204
- if temp_audio_prompt_path and Path(temp_audio_prompt_path).exists():
205
- try:
206
- Path(temp_audio_prompt_path).unlink()
207
- print(f"Deleted temporary audio prompt file: {temp_audio_prompt_path}")
208
- except OSError as e:
209
- print(
210
- f"Warning: Error deleting temporary audio prompt file {temp_audio_prompt_path}: {e}"
211
- )
212
-
213
- return output_audio
214
-
215
-
216
- # --- Create Gradio Interface ---
217
- css = """
218
- #col-container {max-width: 90%; margin-left: auto; margin-right: auto;}
219
- """
220
- # Attempt to load default text from example.txt
221
- default_text = "[S1] Dia is an open weights text to dialogue model. \n[S2] You get full control over scripts and voices. \n[S1] Wow. Amazing. (laughs) \n[S2] Try it now on Git hub or Hugging Face."
222
- example_txt_path = Path("./example.txt")
223
- if example_txt_path.exists():
224
- try:
225
- default_text = example_txt_path.read_text(encoding="utf-8").strip()
226
- if not default_text: # Handle empty example file
227
- default_text = "Example text file was empty."
228
- except Exception as e:
229
- print(f"Warning: Could not read example.txt: {e}")
230
-
231
-
232
- # Build Gradio UI
233
- with gr.Blocks(css=css) as demo:
234
- gr.Markdown("# Nari Text-to-Speech Synthesis")
235
-
236
- with gr.Row(equal_height=False):
237
- with gr.Column(scale=1):
238
- text_input = gr.Textbox(
239
- label="Input Text",
240
- placeholder="Enter text here...",
241
- value=default_text,
242
- lines=5, # Increased lines
243
- )
244
- audio_prompt_input = gr.Audio(
245
- label="Audio Prompt (Optional)",
246
- show_label=True,
247
- sources=["upload", "microphone"],
248
- type="numpy",
249
- )
250
- with gr.Accordion("Generation Parameters", open=False):
251
- max_new_tokens = gr.Slider(
252
- label="Max New Tokens (Audio Length)",
253
- minimum=860,
254
- maximum=3072,
255
- value=model.config.data.audio_length, # Use config default if available, else fallback
256
- step=50,
257
- info="Controls the maximum length of the generated audio (more tokens = longer audio).",
258
- )
259
- cfg_scale = gr.Slider(
260
- label="CFG Scale (Guidance Strength)",
261
- minimum=1.0,
262
- maximum=5.0,
263
- value=3.0, # Default from inference.py
264
- step=0.1,
265
- info="Higher values increase adherence to the text prompt.",
266
- )
267
- temperature = gr.Slider(
268
- label="Temperature (Randomness)",
269
- minimum=1.0,
270
- maximum=1.5,
271
- value=1.3, # Default from inference.py
272
- step=0.05,
273
- info="Lower values make the output more deterministic, higher values increase randomness.",
274
- )
275
- top_p = gr.Slider(
276
- label="Top P (Nucleus Sampling)",
277
- minimum=0.80,
278
- maximum=1.0,
279
- value=0.95, # Default from inference.py
280
- step=0.01,
281
- info="Filters vocabulary to the most likely tokens cumulatively reaching probability P.",
282
- )
283
- cfg_filter_top_k = gr.Slider(
284
- label="CFG Filter Top K",
285
- minimum=15,
286
- maximum=50,
287
- value=30,
288
- step=1,
289
- info="Top k filter for CFG guidance.",
290
- )
291
- speed_factor_slider = gr.Slider(
292
- label="Speed Factor",
293
- minimum=0.8,
294
- maximum=1.0,
295
- value=0.94,
296
- step=0.02,
297
- info="Adjusts the speed of the generated audio (1.0 = original speed).",
298
- )
299
-
300
- run_button = gr.Button("Generate Audio", variant="primary")
301
-
302
- with gr.Column(scale=1):
303
- audio_output = gr.Audio(
304
- label="Generated Audio",
305
- type="numpy",
306
- autoplay=False,
307
- )
308
-
309
- # Link button click to function
310
- run_button.click(
311
- fn=run_inference,
312
- inputs=[
313
- text_input,
314
- audio_prompt_input,
315
- max_new_tokens,
316
- cfg_scale,
317
- temperature,
318
- top_p,
319
- cfg_filter_top_k,
320
- speed_factor_slider,
321
- ],
322
- outputs=[audio_output], # Add status_output here if using it
323
- api_name="generate_audio",
324
- )
325
-
326
- # Add examples (ensure the prompt path is correct or remove it if example file doesn't exist)
327
- example_prompt_path = "./example_prompt.mp3" # Adjust if needed
328
- examples_list = [
329
- [
330
- "[S1] Oh fire! Oh my goodness! What's the procedure? What to we do people? The smoke could be coming through an air duct! \n[S2] Oh my god! Okay.. it's happening. Everybody stay calm! \n[S1] What's the procedure... \n[S2] Everybody stay fucking calm!!!... Everybody fucking calm down!!!!! \n[S1] No! No! If you touch the handle, if its hot there might be a fire down the hallway! ",
331
- None,
332
- 3072,
333
- 3.0,
334
- 1.3,
335
- 0.95,
336
- 35,
337
- 0.94,
338
- ],
339
- [
340
- "[S1] Open weights text to dialogue model. \n[S2] You get full control over scripts and voices. \n[S1] I'm biased, but I think we clearly won. \n[S2] Hard to disagree. (laughs) \n[S1] Thanks for listening to this demo. \n[S2] Try it now on Git hub and Hugging Face. \n[S1] If you liked our model, please give us a star and share to your friends. \n[S2] This was Nari Labs.",
341
- example_prompt_path if Path(example_prompt_path).exists() else None,
342
- 3072,
343
- 3.0,
344
- 1.3,
345
- 0.95,
346
- 35,
347
- 0.94,
348
- ],
349
- ]
350
-
351
- if examples_list:
352
- gr.Examples(
353
- examples=examples_list,
354
- inputs=[
355
- text_input,
356
- audio_prompt_input,
357
- max_new_tokens,
358
- cfg_scale,
359
- temperature,
360
- top_p,
361
- cfg_filter_top_k,
362
- speed_factor_slider,
363
- ],
364
- outputs=[audio_output],
365
- fn=run_inference,
366
- cache_examples=False,
367
- label="Examples (Click to Run)",
368
- )
369
- else:
370
- gr.Markdown("_(No examples configured or example prompt file missing)_")
371
-
372
- # --- Launch the App ---
373
- if __name__ == "__main__":
374
- print("Launching Gradio interface...")
375
-
376
- # set `GRADIO_SERVER_NAME`, `GRADIO_SERVER_PORT` env vars to override default values
377
- # use `GRADIO_SERVER_NAME=0.0.0.0` for Docker
378
- demo.launch()
utils/tts.py CHANGED
@@ -1,72 +1,125 @@
1
  import logging
 
 
 
 
 
 
 
 
 
 
2
 
3
  # Configure logging
4
  logger = logging.getLogger(__name__)
5
 
6
- # Import the factory pattern implementation
7
- from utils.tts_factory import TTSFactory
8
 
9
- # Import base classes
10
- from utils.tts_base import TTSEngineBase, DummyTTSEngine
11
-
12
- # Import engine-specific modules
13
- from utils.tts_engines import (
14
- get_available_engines,
15
- create_engine,
16
- KokoroTTSEngine,
17
- KokoroSpaceTTSEngine,
18
- DiaTTSEngine
19
- )
 
 
 
 
 
 
 
 
 
 
20
 
21
- # Import legacy functions for backward compatibility
22
- from utils.tts_kokoro import generate_speech as kokoro_generate_speech
23
- from utils.tts_kokoro_space import generate_speech as kokoro_space_generate_speech
24
- from utils.tts_dia import generate_speech as dia_generate_speech
25
 
26
- # Convenience function to get the best available TTS engine
27
- def get_best_engine(lang_code: str = 'z') -> TTSEngineBase:
28
- """Get the best available TTS engine
29
 
30
  Args:
 
 
31
  lang_code (str): Language code for the engine
32
 
33
  Returns:
34
- TTSEngineBase: An instance of the best available TTS engine
35
  """
36
- return TTSFactory.create_engine(None, lang_code)
37
-
38
- # Function to get a TTS engine instance (for backward compatibility)
39
- def get_tts_engine(engine_type: str = None, lang_code: str = 'z') -> TTSEngineBase:
40
- """Get a TTS engine instance
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- This function is maintained for backward compatibility with app.py.
43
- New code should use the factory pattern implementation directly.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  Args:
46
- engine_type (str, optional): Type of engine to create ('kokoro', 'kokoro_space', 'dia', 'dummy')
47
- If None, the best available engine will be used
48
- lang_code (str): Language code for the engine
 
 
49
 
50
  Returns:
51
- TTSEngineBase: An instance of a TTS engine
52
  """
53
- return TTSFactory.create_engine(engine_type, lang_code)
 
54
 
55
- # Legacy function for backward compatibility
56
- def generate_speech(text: str, language: str = "z", voice: str = "af_heart", speed: float = 1.0) -> str:
57
- """Generate speech using the best available TTS engine
58
-
59
- This is a legacy function maintained for backward compatibility.
60
- New code should use the factory pattern implementation directly.
61
 
62
  Args:
63
  text (str): Input text to synthesize
64
- language (str): Language code
 
65
  voice (str): Voice ID to use
66
  speed (float): Speech speed multiplier
67
 
68
- Returns:
69
- str: Path to the generated audio file
70
  """
71
- engine = get_best_engine(language)
72
- return engine.generate_speech(text, voice, speed)
 
1
  import logging
2
+ from typing import Optional, Generator, Tuple, List
3
+ import numpy as np
4
+
5
+ # Import the base class and dummy implementation
6
+ from utils.tts_simplified import TTSBase, DummyTTS
7
+
8
+ # Import the specific TTS implementations
9
+ from utils.tts_kokoro_simplified import KokoroTTS, KOKORO_AVAILABLE
10
+ from utils.tts_dia_simplified import DiaTTS, DIA_AVAILABLE
11
+ from utils.tts_cosyvoice2_simplified import CosyVoice2TTS, COSYVOICE2_AVAILABLE
12
 
13
  # Configure logging
14
  logger = logging.getLogger(__name__)
15
 
 
 
16
 
17
+ def get_available_engines() -> List[str]:
18
+ """Get a list of available TTS engines
19
+
20
+ Returns:
21
+ List[str]: List of available engine names
22
+ """
23
+ available = []
24
+
25
+ if KOKORO_AVAILABLE:
26
+ available.append('kokoro')
27
+
28
+ if DIA_AVAILABLE:
29
+ available.append('dia')
30
+
31
+ if COSYVOICE2_AVAILABLE:
32
+ available.append('cosyvoice2')
33
+
34
+ # Dummy is always available
35
+ available.append('dummy')
36
+
37
+ return available
38
 
 
 
 
 
39
 
40
+ def get_tts_engine(engine_type: Optional[str] = None, lang_code: str = 'z') -> TTSBase:
41
+ """Get a TTS engine instance
 
42
 
43
  Args:
44
+ engine_type (str, optional): Type of engine to create ('kokoro', 'dia', 'cosyvoice2', 'dummy')
45
+ If None, the best available engine will be used
46
  lang_code (str): Language code for the engine
47
 
48
  Returns:
49
+ TTSBase: An instance of a TTS engine
50
  """
51
+ # Get available engines
52
+ available_engines = get_available_engines()
53
+ logger.info(f"Available TTS engines: {available_engines}")
54
+
55
+ # If engine_type is specified, try to create that specific engine
56
+ if engine_type is not None:
57
+ if engine_type == 'kokoro' and KOKORO_AVAILABLE:
58
+ logger.info("Creating Kokoro TTS engine")
59
+ return KokoroTTS(lang_code)
60
+ elif engine_type == 'dia' and DIA_AVAILABLE:
61
+ logger.info("Creating Dia TTS engine")
62
+ return DiaTTS(lang_code)
63
+ elif engine_type == 'cosyvoice2' and COSYVOICE2_AVAILABLE:
64
+ logger.info("Creating CosyVoice2 TTS engine")
65
+ return CosyVoice2TTS(lang_code)
66
+ elif engine_type == 'dummy':
67
+ logger.info("Creating Dummy TTS engine")
68
+ return DummyTTS(lang_code)
69
+ else:
70
+ logger.warning(f"Requested engine '{engine_type}' is not available")
71
 
72
+ # If no specific engine is requested or the requested engine is not available,
73
+ # use the best available engine based on priority
74
+ priority_order = ['cosyvoice2', 'kokoro', 'dia', 'dummy']
75
+ for engine in priority_order:
76
+ if engine in available_engines:
77
+ logger.info(f"Using best available engine: {engine}")
78
+ if engine == 'kokoro':
79
+ return KokoroTTS(lang_code)
80
+ elif engine == 'dia':
81
+ return DiaTTS(lang_code)
82
+ elif engine == 'cosyvoice2':
83
+ return CosyVoice2TTS(lang_code)
84
+ elif engine == 'dummy':
85
+ return DummyTTS(lang_code)
86
+
87
+ # Fallback to dummy engine if no engines are available
88
+ logger.warning("No TTS engines available, falling back to dummy engine")
89
+ return DummyTTS(lang_code)
90
+
91
+
92
+ def generate_speech(text: str, engine_type: Optional[str] = None, lang_code: str = 'z',
93
+ voice: str = 'default', speed: float = 1.0) -> Optional[str]:
94
+ """Generate speech using the specified or best available TTS engine
95
 
96
  Args:
97
+ text (str): Input text to synthesize
98
+ engine_type (str, optional): Type of engine to use
99
+ lang_code (str): Language code
100
+ voice (str): Voice ID to use
101
+ speed (float): Speech speed multiplier
102
 
103
  Returns:
104
+ Optional[str]: Path to the generated audio file or None if generation fails
105
  """
106
+ engine = get_tts_engine(engine_type, lang_code)
107
+ return engine.generate_speech(text, voice, speed)
108
 
109
+
110
+ def generate_speech_stream(text: str, engine_type: Optional[str] = None, lang_code: str = 'z',
111
+ voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
112
+ """Generate speech stream using the specified or best available TTS engine
 
 
113
 
114
  Args:
115
  text (str): Input text to synthesize
116
+ engine_type (str, optional): Type of engine to use
117
+ lang_code (str): Language code
118
  voice (str): Voice ID to use
119
  speed (float): Speech speed multiplier
120
 
121
+ Yields:
122
+ tuple: (sample_rate, audio_data) pairs for each segment
123
  """
124
+ engine = get_tts_engine(engine_type, lang_code)
125
+ yield from engine.generate_speech_stream(text, voice, speed)
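To make the refactored module's public surface concrete, here is a minimal usage sketch. It assumes the module above is importable as `utils.tts` and that `numpy` and `soundfile` are installed; the `'dummy'` engine is used because `get_available_engines` always includes it.

```python
# Minimal sketch, assuming utils/tts.py from this commit is on the import path.
import numpy as np
import soundfile as sf

from utils.tts import generate_speech, generate_speech_stream, get_available_engines

print(get_available_engines())  # e.g. ['dummy'] when no model dependencies are installed

# One-shot synthesis: returns a file path, or None if generation fails.
path = generate_speech("Hello, world!", engine_type="dummy")
print(path)

# Streaming synthesis: collect the (sample_rate, audio) segments into one buffer.
chunks = []
sample_rate = 24000  # overwritten by the values the generator yields
for sample_rate, audio in generate_speech_stream("Hello, world!", engine_type="dummy"):
    chunks.append(audio)
if chunks:
    sf.write("stream_output.wav", np.concatenate(chunks), sample_rate)
```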
utils/tts_README.md ADDED
@@ -0,0 +1,64 @@
1
+ # TTS Structure
2
+
3
+ This directory contains a Text-to-Speech (TTS) implementation that supports three specific models:
4
+
5
+ 1. Kokoro: https://github.com/hexgrad/kokoro
6
+ 2. Dia: https://github.com/nari-labs/dia
7
+ 3. CosyVoice2: https://github.com/FunAudioLLM/CosyVoice
8
+
9
+ ## Structure
10
+
11
+ The TTS implementation follows a simple, clean structure:
12
+
13
+ - `tts.py`: Contains the base `TTSBase` abstract class and `DummyTTS` implementation
14
+ - `tts_kokoro.py`: Kokoro TTS implementation
15
+ - `tts_dia.py`: Dia TTS implementation
16
+ - `tts_cosyvoice2.py`: CosyVoice2 TTS implementation
17
+ - `tts_main.py`: Main entry point for TTS functionality
18
+
19
+ ## Usage
20
+
21
+ ```python
22
+ # Import the main TTS functions
23
+ from utils.tts_main import generate_speech, generate_speech_stream, get_tts_engine
24
+
25
+ # Generate speech using the best available engine
26
+ audio_path = generate_speech("Hello, world!")
27
+
28
+ # Generate speech using a specific engine
29
+ audio_path = generate_speech("Hello, world!", engine_type="kokoro")
30
+
31
+ # Generate speech with specific parameters
32
+ audio_path = generate_speech(
33
+ "Hello, world!",
34
+ engine_type="dia",
35
+ lang_code="en",
36
+ voice="default",
37
+ speed=1.0
38
+ )
39
+
40
+ # Generate speech stream
41
+ for sample_rate, audio_data in generate_speech_stream("Hello, world!"):
42
+ # Process audio data
43
+ pass
44
+
45
+ # Get a specific TTS engine instance
46
+ engine = get_tts_engine("kokoro")
47
+ audio_path = engine.generate_speech("Hello, world!")
48
+ ```
49
+
50
+ ## Error Handling
51
+
52
+ All TTS implementations include robust error handling:
53
+
54
+ 1. Each implementation checks for the availability of its dependencies
55
+ 2. If a specific engine fails, it automatically falls back to the `DummyTTS` implementation
56
+ 3. The main module prioritizes engines based on availability (a short sketch of this fallback follows below)
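From the caller's side, the fallback behaviour described above looks like this. A brief sketch, assuming `get_tts_engine` as defined in `tts_main.py`:

```python
# Sketch: requesting an engine whose dependencies are missing falls back
# through the priority order; the dummy engine is the guaranteed last resort.
from utils.tts_main import get_tts_engine

engine = get_tts_engine(engine_type="kokoro")
print(type(engine).__name__)  # 'KokoroTTS' if available, otherwise e.g. 'DummyTTS'
```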
57
+
58
+ ## Adding New Engines
59
+
60
+ To add a new TTS engine:
61
+
62
+ 1. Create a new file `tts_<engine_name>.py`
63
+ 2. Implement a class that inherits from `TTSBase` (see the sketch after this list)
64
+ 3. Add the engine to the available engines list in `tts_main.py`
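A minimal sketch of steps 1-2 follows. `MyTTS` and its file name are hypothetical, and the import path for `TTSBase` is an assumption (this commit defines it in `utils/tts_base.py`, while the Structure section above refers to `tts.py`):

```python
# utils/tts_mytts.py -- hypothetical engine illustrating the steps above.
from typing import Optional, Generator, Tuple

import numpy as np
import soundfile as sf

from utils.tts_base import TTSBase  # step 2: inherit from the abstract base


class MyTTS(TTSBase):
    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
        # A real engine would run its model here; this stub writes one second of silence.
        sample_rate = 24000
        audio = np.zeros(sample_rate, dtype=np.float32)
        output_path = self._generate_output_path(prefix="mytts")
        sf.write(output_path, audio, sample_rate)
        return output_path

    def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        # Yield a single segment; a real engine might yield one segment per sentence.
        yield 24000, np.zeros(24000, dtype=np.float32)
```

Step 3 then amounts to adding a `MYTTS_AVAILABLE` flag and a branch for `'mytts'` in the engine-selection logic.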
utils/tts_base.py CHANGED
@@ -1,50 +1,46 @@
 
1
  import os
2
  import time
3
- import logging
4
- import soundfile as sf
5
  import numpy as np
 
 
6
  from abc import ABC, abstractmethod
7
- from typing import Tuple, Generator, Optional
8
 
9
  # Configure logging
10
  logger = logging.getLogger(__name__)
11
 
12
- class TTSEngineBase(ABC):
 
13
  """Base class for all TTS engines
14
 
15
  This abstract class defines the interface that all TTS engines must implement.
16
- It also provides common utility methods for file handling and audio generation.
17
  """
18
 
19
  def __init__(self, lang_code: str = 'z'):
20
  """Initialize the TTS engine
21
 
22
  Args:
23
- lang_code (str): Language code ('a' for US English, 'b' for British English,
24
- 'j' for Japanese, 'z' for Mandarin Chinese)
25
- Note: Not all engines support all language codes
26
  """
27
  self.lang_code = lang_code
28
- logger.info(f"Initializing {self.__class__.__name__} with language code: {lang_code}")
29
 
30
  @abstractmethod
31
- def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Optional[str]:
32
  """Generate speech from text
33
 
34
  Args:
35
  text (str): Input text to synthesize
36
- voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
37
- Note: Not all engines support all voices
38
- speed (float): Speech speed multiplier (0.5 to 2.0)
39
- Note: Not all engines support speed adjustment
40
 
41
  Returns:
42
- Optional[str]: Path to the generated audio file, or None if generation fails
43
  """
44
  pass
45
 
46
- def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
47
- """Generate speech from text and yield each segment
 
48
 
49
  Args:
50
  text (str): Input text to synthesize
@@ -54,93 +50,75 @@ class TTSEngineBase(ABC):
54
  Yields:
55
  tuple: (sample_rate, audio_data) pairs for each segment
56
  """
57
- # Default implementation: generate full audio and yield as a single chunk
58
- output_path = self.generate_speech(text, voice, speed)
59
- audio_data, sample_rate = sf.read(output_path)
60
- yield sample_rate, audio_data
61
-
62
- def _create_output_dir(self) -> str:
63
- """Create output directory for audio files
64
-
65
- Returns:
66
- str: Path to the output directory
67
- """
68
- output_dir = "temp/outputs"
69
- os.makedirs(output_dir, exist_ok=True)
70
- return output_dir
71
 
72
- def _generate_output_path(self, prefix: str = "output") -> str:
73
- """Generate a unique output path for audio files
74
 
75
  Args:
76
- prefix (str): Prefix for the output filename
 
77
 
78
  Returns:
79
  str: Path to the output file
80
  """
81
- output_dir = self._create_output_dir()
82
- timestamp = int(time.time())
83
- return f"{output_dir}/{prefix}_{timestamp}.wav"
 
 
84
 
85
 
86
- class DummyTTSEngine(TTSEngineBase):
87
- """Dummy TTS engine that generates a simple sine wave
88
 
89
- This engine is used as a fallback when no other engines are available.
90
  """
91
 
92
- def __init__(self, lang_code: str = 'z'):
93
- super().__init__(lang_code)
94
- logger.warning("Using dummy TTS implementation as no other engines are available")
95
-
96
- def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
97
- """Generate a dummy audio file with a simple sine wave
98
 
99
  Args:
100
  text (str): Input text (not used)
101
  voice (str): Voice ID (not used)
102
- speed (float): Speed multiplier (not used)
103
 
104
  Returns:
105
- str: Path to the generated dummy audio file
106
  """
107
  logger.info(f"Generating dummy speech for text length: {len(text)}")
108
 
109
- # Generate unique output path
110
- output_path = self._generate_output_path("dummy")
111
-
112
  # Generate a simple sine wave
113
  sample_rate = 24000
114
- duration = 3.0 # seconds
115
- t = np.linspace(0, duration, int(sample_rate * duration), False)
116
- tone = np.sin(2 * np.pi * 440 * t) * 0.3
117
 
118
- # Save the audio file
119
- logger.info(f"Saving dummy audio to {output_path}")
120
- sf.write(output_path, tone, sample_rate)
121
- logger.info(f"Dummy audio generation complete: {output_path}")
122
 
 
123
  return output_path
124
 
125
- def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
126
- """Generate dummy audio chunks with simple sine waves
127
 
128
  Args:
129
  text (str): Input text (not used)
130
  voice (str): Voice ID (not used)
131
- speed (float): Speed multiplier (not used)
132
 
133
  Yields:
134
- tuple: (sample_rate, audio_data) pairs for each dummy segment
135
  """
136
  logger.info(f"Generating dummy speech stream for text length: {len(text)}")
137
 
 
138
  sample_rate = 24000
139
- duration = 1.0 # seconds per chunk
 
 
140
 
141
- # Create 3 chunks of dummy audio
142
- for i in range(3):
143
- t = np.linspace(0, duration, int(sample_rate * duration), False)
144
- freq = 440 + (i * 220) # Different frequency for each chunk
145
- tone = np.sin(2 * np.pi * freq * t) * 0.3
146
- yield sample_rate, tone
 
1
+ import logging
2
  import os
3
  import time
 
 
4
  import numpy as np
5
+ import soundfile as sf
6
+ from typing import Optional, Generator, Tuple, List
7
  from abc import ABC, abstractmethod
 
8
 
9
  # Configure logging
10
  logger = logging.getLogger(__name__)
11
 
12
+
13
+ class TTSBase(ABC):
14
  """Base class for all TTS engines
15
 
16
  This abstract class defines the interface that all TTS engines must implement.
 
17
  """
18
 
19
  def __init__(self, lang_code: str = 'z'):
20
  """Initialize the TTS engine
21
 
22
  Args:
23
+ lang_code (str): Language code for the engine
 
 
24
  """
25
  self.lang_code = lang_code
 
26
 
27
  @abstractmethod
28
+ def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
29
  """Generate speech from text
30
 
31
  Args:
32
  text (str): Input text to synthesize
33
+ voice (str): Voice ID to use
34
+ speed (float): Speech speed multiplier
 
 
35
 
36
  Returns:
37
+ Optional[str]: Path to the generated audio file or None if generation fails
38
  """
39
  pass
40
 
41
+ @abstractmethod
42
+ def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
43
+ """Generate speech stream from text
44
 
45
  Args:
46
  text (str): Input text to synthesize
 
50
  Yields:
51
  tuple: (sample_rate, audio_data) pairs for each segment
52
  """
53
+ pass
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
+ def _generate_output_path(self, prefix: str = "tts", extension: str = "wav") -> str:
56
+ """Generate a unique output path for the audio file
57
 
58
  Args:
59
+ prefix (str): Prefix for the filename
60
+ extension (str): File extension
61
 
62
  Returns:
63
  str: Path to the output file
64
  """
65
+ timestamp = int(time.time() * 1000)
66
+ filename = f"{prefix}_{timestamp}.{extension}"
67
+ output_dir = os.path.join(os.getcwd(), "output")
68
+ os.makedirs(output_dir, exist_ok=True)
69
+ return os.path.join(output_dir, filename)
70
 
71
 
72
+ class DummyTTS(TTSBase):
73
+ """Dummy TTS engine that generates sine wave audio
74
 
75
+ This class is used as a fallback when no other TTS engine is available.
76
  """
77
 
78
+ def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> str:
79
+ """Generate a dummy sine wave audio file
 
 
 
 
80
 
81
  Args:
82
  text (str): Input text (not used)
83
  voice (str): Voice ID (not used)
84
+ speed (float): Speech speed multiplier (not used)
85
 
86
  Returns:
87
+ str: Path to the generated audio file
88
  """
89
  logger.info(f"Generating dummy speech for text length: {len(text)}")
90
 
 
 
 
91
  # Generate a simple sine wave
92
  sample_rate = 24000
93
+ duration = min(len(text) / 20, 10) # Rough approximation of speech duration
94
+ t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
95
+ audio = 0.5 * np.sin(2 * np.pi * 440 * t) # 440 Hz sine wave
96
 
97
+ # Save to file
98
+ output_path = self._generate_output_path(prefix="dummy")
99
+ sf.write(output_path, audio, sample_rate)
 
100
 
101
+ logger.info(f"Generated dummy audio: {output_path}")
102
  return output_path
103
 
104
+ def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
105
+ """Generate a dummy sine wave audio stream
106
 
107
  Args:
108
  text (str): Input text (not used)
109
  voice (str): Voice ID (not used)
110
+ speed (float): Speech speed multiplier (not used)
111
 
112
  Yields:
113
+ tuple: (sample_rate, audio_data) pairs
114
  """
115
  logger.info(f"Generating dummy speech stream for text length: {len(text)}")
116
 
117
+ # Generate a simple sine wave
118
  sample_rate = 24000
119
+ duration = min(len(text) / 20, 10) # Rough approximation of speech duration
120
+ t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
121
+ audio = 0.5 * np.sin(2 * np.pi * 440 * t) # 440 Hz sine wave
122
 
123
+ # Yield the audio data
124
+ yield sample_rate, audio
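Because `DummyTTS` is the fallback every concrete engine relies on, a quick end-to-end check of it is worth having. A brief sketch, assuming this module is importable as `utils.tts_base`:

```python
# Sketch: exercise DummyTTS end to end. The sine wave lasts len(text)/20
# seconds (capped at 10) at a 24 kHz sample rate.
import soundfile as sf
from utils.tts_base import DummyTTS

engine = DummyTTS(lang_code='z')
path = engine.generate_speech("Hello, world!")  # 13 chars -> 0.65 s of 440 Hz tone
audio, sr = sf.read(path)
print(path, sr, len(audio) / sr)
```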
 
 
 
 
utils/tts_cascading.py DELETED
@@ -1,112 +0,0 @@
1
- import logging
2
- from typing import List, Tuple, Generator, Optional
3
- import numpy as np
4
-
5
- from utils.tts_base import TTSEngineBase, DummyTTSEngine
6
- from utils.tts_engines import create_engine
7
-
8
- # Configure logging
9
- logger = logging.getLogger(__name__)
10
-
11
- class CascadingTTSEngine(TTSEngineBase):
12
- """Cascading TTS engine implementation
13
-
14
- This engine tries multiple TTS engines in order until one succeeds.
15
- It provides a fallback mechanism to maximize the chances of getting
16
- quality speech output.
17
- """
18
-
19
- def __init__(self, engine_types: List[str], lang_code: str = 'z'):
20
- """Initialize the cascading TTS engine
21
-
22
- Args:
23
- engine_types (List[str]): List of engine types to try in order
24
- lang_code (str): Language code for the engines
25
- """
26
- super().__init__(lang_code)
27
- self.engine_types = engine_types
28
- self.lang_code = lang_code
29
- logger.info(f"Initialized cascading TTS engine with engines: {engine_types}")
30
-
31
- def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
32
- """Generate speech by trying multiple engines in order
33
-
34
- Args:
35
- text (str): Input text to synthesize
36
- voice (str): Voice ID to use
37
- speed (float): Speech speed multiplier
38
-
39
- Returns:
40
- str: Path to the generated audio file
41
- """
42
- logger.info(f"Generating speech with cascading engine for text length: {len(text)}")
43
-
44
- # Try each engine in order
45
- for engine_type in self.engine_types:
46
- try:
47
- logger.info(f"Trying TTS engine: {engine_type}")
48
- engine = create_engine(engine_type, self.lang_code)
49
-
50
- # Generate speech with the current engine
51
- result = engine.generate_speech(text, voice, speed)
52
-
53
- # If the engine returned a valid result, return it
54
- if result is not None:
55
- logger.info(f"Successfully generated speech with {engine_type}")
56
- return result
57
-
58
- logger.warning(f"TTS engine {engine_type} failed to generate speech, trying next engine")
59
- except Exception as e:
60
- logger.error(f"Error with TTS engine {engine_type}: {str(e)}")
61
- logger.error(f"Error type: {type(e).__name__}")
62
- logger.warning(f"Trying next TTS engine")
63
-
64
- # If all engines failed, fall back to dummy engine
65
- logger.warning("All TTS engines failed, falling back to dummy engine")
66
- return DummyTTSEngine(self.lang_code).generate_speech(text, voice, speed)
67
-
68
- def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
69
- """Generate speech stream by trying multiple engines in order
70
-
71
- Args:
72
- text (str): Input text to synthesize
73
- voice (str): Voice ID to use
74
- speed (float): Speech speed multiplier
75
-
76
- Yields:
77
- tuple: (sample_rate, audio_data) pairs for each segment
78
- """
79
- logger.info(f"Generating speech stream with cascading engine for text length: {len(text)}")
80
-
81
- # Try each engine in order
82
- for engine_type in self.engine_types:
83
- try:
84
- logger.info(f"Trying TTS engine for streaming: {engine_type}")
85
- engine = create_engine(engine_type, self.lang_code)
86
-
87
- # Create a generator for the current engine
88
- generator = engine.generate_speech_stream(text, voice, speed)
89
-
90
- # Try to get the first chunk to verify the engine works
91
- first_chunk = next(generator, None)
92
- if first_chunk is not None:
93
- # Engine produced a valid first chunk, yield it and continue with this engine
94
- logger.info(f"Successfully started speech stream with {engine_type}")
95
- yield first_chunk
96
-
97
- # Yield the rest of the chunks from this engine
98
- for chunk in generator:
99
- yield chunk
100
-
101
- # Successfully streamed all chunks, return
102
- return
103
-
104
- logger.warning(f"TTS engine {engine_type} failed to generate speech stream, trying next engine")
105
- except Exception as e:
106
- logger.error(f"Error with TTS engine {engine_type} streaming: {str(e)}")
107
- logger.error(f"Error type: {type(e).__name__}")
108
- logger.warning(f"Trying next TTS engine for streaming")
109
-
110
- # If all engines failed, fall back to dummy engine
111
- logger.warning("All TTS engines failed for streaming, falling back to dummy engine")
112
- yield from DummyTTSEngine(self.lang_code).generate_speech_stream(text, voice, speed)
 
utils/tts_cosyvoice2.py ADDED
@@ -0,0 +1,194 @@
1
+ import logging
2
+ import numpy as np
3
+ import soundfile as sf
4
+ from typing import Optional, Generator, Tuple
5
+
6
+ from utils.tts_simplified import TTSBase, DummyTTS
7
+
8
+ # Configure logging
9
+ logger = logging.getLogger(__name__)
10
+
11
+ # Flag to track CosyVoice2 availability
12
+ COSYVOICE2_AVAILABLE = False
13
+ DEFAULT_SAMPLE_RATE = 24000
14
+
15
+ # Try to import CosyVoice2 dependencies
16
+ try:
17
+ import torch
18
+ # Import CosyVoice2 - assuming it's installed and exposes a Dia-like API
19
+ # (unverified: CosyVoice2 is a FunAudioLLM project, not a nari-labs one)
20
+ from cosyvoice2.model import CosyVoice2
21
+ COSYVOICE2_AVAILABLE = True
22
+ logger.info("CosyVoice2 TTS engine is available")
23
+ except ModuleNotFoundError as e:
24
+ logger.warning(f"CosyVoice2 TTS engine is not available: {str(e)}")
25
+ except ImportError:
26
+ logger.warning("CosyVoice2 TTS engine is not available")
27
+ COSYVOICE2_AVAILABLE = False
28
+
29
+
30
+ def _get_model():
31
+ """Lazy-load the CosyVoice2 model
32
+
33
+ Returns:
34
+ CosyVoice2 or None: The CosyVoice2 model or None if not available
35
+ """
36
+ if not COSYVOICE2_AVAILABLE:
37
+ logger.warning("CosyVoice2 TTS engine is not available")
38
+ return None
39
+
40
+ try:
41
+ import torch
42
+ from cosyvoice2.model import CosyVoice2
43
+
44
+ # Initialize the model
45
+ model = CosyVoice2.from_pretrained()
46
+ logger.info("CosyVoice2 model successfully loaded")
47
+ return model
48
+ except ImportError as e:
49
+ logger.error(f"Failed to import CosyVoice2 dependencies: {str(e)}")
50
+ return None
51
+ except FileNotFoundError as e:
52
+ logger.error(f"Failed to load CosyVoice2 model files: {str(e)}")
53
+ return None
54
+ except Exception as e:
55
+ logger.error(f"Failed to initialize CosyVoice2 model: {str(e)}")
56
+ return None
57
+
58
+
59
+ class CosyVoice2TTS(TTSBase):
60
+ """CosyVoice2 TTS engine implementation
61
+
62
+ This engine uses the CosyVoice2 model for TTS generation.
63
+ """
64
+
65
+ def __init__(self, lang_code: str = 'z'):
66
+ """Initialize the CosyVoice2 TTS engine
67
+
68
+ Args:
69
+ lang_code (str): Language code for the engine
70
+ """
71
+ super().__init__(lang_code)
72
+ self.model = None
73
+
74
+ def _ensure_model(self):
75
+ """Ensure the model is loaded
76
+
77
+ Returns:
78
+ bool: True if model is available, False otherwise
79
+ """
80
+ if self.model is None:
81
+ self.model = _get_model()
82
+
83
+ return self.model is not None
84
+
85
+ def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
86
+ """Generate speech using CosyVoice2 TTS engine
87
+
88
+ Args:
89
+ text (str): Input text to synthesize
90
+ voice (str): Voice ID (may not be used in CosyVoice2)
91
+ speed (float): Speech speed multiplier (may not be used in CosyVoice2)
92
+
93
+ Returns:
94
+ Optional[str]: Path to the generated audio file or None if generation fails
95
+ """
96
+ logger.info(f"Generating speech with CosyVoice2 for text length: {len(text)}")
97
+
98
+ # Check if CosyVoice2 is available
99
+ if not COSYVOICE2_AVAILABLE:
100
+ logger.warning("CosyVoice2 TTS engine is not available, falling back to dummy TTS")
101
+ return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
102
+
103
+ # Ensure model is loaded
104
+ if not self._ensure_model():
105
+ logger.warning("Failed to load CosyVoice2 model, falling back to dummy TTS")
106
+ return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
107
+
108
+ try:
109
+ import torch
110
+
111
+ # Generate unique output path
112
+ output_path = self._generate_output_path(prefix="cosyvoice2")
113
+
114
+ # Generate audio
115
+ with torch.inference_mode():
116
+ # Assuming CosyVoice2 has a similar API to Dia
117
+ output_audio_np = self.model.generate(
118
+ text,
119
+ max_tokens=None,
120
+ cfg_scale=3.0,
121
+ temperature=1.3,
122
+ top_p=0.95,
123
+ use_torch_compile=False,
124
+ verbose=False
125
+ )
126
+
127
+ if output_audio_np is not None:
128
+ logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
129
+ sf.write(output_path, output_audio_np, DEFAULT_SAMPLE_RATE)
130
+ logger.info(f"CosyVoice2 audio generation complete: {output_path}")
131
+ return output_path
132
+ else:
133
+ logger.warning("CosyVoice2 model returned None for audio output")
134
+ logger.warning("Falling back to dummy TTS")
135
+ return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
136
+
137
+ except Exception as e:
138
+ logger.error(f"Error generating speech with CosyVoice2: {str(e)}", exc_info=True)
139
+ logger.warning("CosyVoice2 TTS engine failed, falling back to dummy TTS")
140
+ return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
141
+
142
+ def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
143
+ """Generate speech stream using CosyVoice2 TTS engine
144
+
145
+ Args:
146
+ text (str): Input text to synthesize
147
+ voice (str): Voice ID (may not be used in CosyVoice2)
148
+ speed (float): Speech speed multiplier (may not be used in CosyVoice2)
149
+
150
+ Yields:
151
+ tuple: (sample_rate, audio_data) pairs for each segment
152
+ """
153
+ logger.info(f"Generating speech stream with CosyVoice2 for text length: {len(text)}")
154
+
155
+ # Check if CosyVoice2 is available
156
+ if not COSYVOICE2_AVAILABLE:
157
+ logger.warning("CosyVoice2 TTS engine is not available, falling back to dummy TTS")
158
+ yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
159
+ return
160
+
161
+ # Ensure model is loaded
162
+ if not self._ensure_model():
163
+ logger.warning("Failed to load CosyVoice2 model, falling back to dummy TTS")
164
+ yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
165
+ return
166
+
167
+ try:
168
+ import torch
169
+
170
+ # Generate audio
171
+ with torch.inference_mode():
172
+ # Assuming CosyVoice2 has a similar API to Dia
173
+ output_audio_np = self.model.generate(
174
+ text,
175
+ max_tokens=None,
176
+ cfg_scale=3.0,
177
+ temperature=1.3,
178
+ top_p=0.95,
179
+ use_torch_compile=False,
180
+ verbose=False
181
+ )
182
+
183
+ if output_audio_np is not None:
184
+ logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
185
+ yield DEFAULT_SAMPLE_RATE, output_audio_np
186
+ else:
187
+ logger.warning("CosyVoice2 model returned None for audio output")
188
+ logger.warning("Falling back to dummy TTS")
189
+ yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
190
+
191
+ except Exception as e:
192
+ logger.error(f"Error generating speech stream with CosyVoice2: {str(e)}", exc_info=True)
193
+ logger.warning("CosyVoice2 TTS engine failed, falling back to dummy TTS")
194
+ yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
utils/tts_dia.py CHANGED
@@ -1,135 +1,207 @@
1
- import os
2
- import time
3
  import logging
4
  import numpy as np
5
  import soundfile as sf
6
- from pathlib import Path
7
- from typing import Optional
 
8
 
9
  # Configure logging
10
- logging.basicConfig(level=logging.INFO)
11
  logger = logging.getLogger(__name__)
12
 
13
  # Flag to track Dia availability
14
  DIA_AVAILABLE = False
 
15
 
16
- # Try to import required dependencies
17
  try:
18
  import torch
19
- # Try to import Dia, which will try to import dac
20
- try:
21
- from dia.model import Dia
22
- DIA_AVAILABLE = True
23
- logger.info("Dia TTS engine is available")
24
- except ModuleNotFoundError as e:
25
- if "dac" in str(e):
26
- logger.warning("Dia TTS engine is not available due to missing 'dac' module")
27
- else:
28
- logger.warning(f"Dia TTS engine is not available: {str(e)}")
29
- DIA_AVAILABLE = False
30
  except ImportError:
31
- logger.warning("Torch not available, Dia TTS engine cannot be used")
 
 
 
 
 
32
  DIA_AVAILABLE = False
33
 
34
- # Constants
35
- DEFAULT_SAMPLE_RATE = 44100
36
- DEFAULT_MODEL_NAME = "nari-labs/Dia-1.6B"
37
-
38
- # Global model instance (lazy loaded)
39
- _model = None
40
-
41
 
42
  def _get_model():
43
- """Lazy-load the Dia model to avoid loading it until needed"""
44
- global _model
45
 
46
- # Check if Dia is available before attempting to load
 
 
47
  if not DIA_AVAILABLE:
48
- logger.warning("Dia is not available, cannot load model")
49
- raise ImportError("Dia module is not available")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
- if _model is None:
52
- logger.info("Loading Dia model...")
53
  try:
54
- # Check if torch is available with correct version
55
- logger.info(f"PyTorch version: {torch.__version__}")
56
- logger.info(f"CUDA available: {torch.cuda.is_available()}")
57
- if torch.cuda.is_available():
58
- logger.info(f"CUDA version: {torch.version.cuda}")
59
- logger.info(f"GPU device: {torch.cuda.get_device_name(0)}")
60
 
61
- # Check if model path exists
62
- logger.info(f"Attempting to load model from: {DEFAULT_MODEL_NAME}")
63
 
64
- # Load the model with detailed logging
65
- logger.info("Initializing Dia model...")
66
- _model = Dia.from_pretrained(DEFAULT_MODEL_NAME, compute_dtype="float16")
 
 
 
 
 
 
 
 
 
67
 
68
- # Log model details
69
- logger.info(f"Dia model loaded successfully")
70
- logger.info(f"Model type: {type(_model).__name__}")
71
- # Check if model has parameters method (PyTorch models do, but Dia might not)
72
- if hasattr(_model, 'parameters'):
73
- logger.info(f"Model device: {next(_model.parameters()).device}")
74
  else:
75
- logger.info("Model device: Device information not available for Dia model")
76
- except ImportError as import_err:
77
- logger.error(f"Import error loading Dia model: {import_err}")
78
- logger.error(f"This may indicate missing dependencies")
79
- raise
80
- except FileNotFoundError as file_err:
81
- logger.error(f"File not found error loading Dia model: {file_err}")
82
- logger.error(f"Model path may be incorrect or inaccessible")
83
- raise
 
84
  except Exception as e:
85
- logger.error(f"Error loading Dia model: {e}", exc_info=True)
86
- logger.error(f"Error type: {type(e).__name__}")
87
- logger.error(f"This may indicate incompatible versions or missing CUDA support")
88
- raise
89
- return _model
90
-
91
-
92
- def generate_speech(text: str, language: str = "zh") -> str:
93
- """Public interface for TTS generation using Dia model
94
 
95
- This is a legacy function maintained for backward compatibility.
96
- New code should use the factory pattern implementation directly.
97
-
98
- Args:
99
- text (str): Input text to synthesize
100
- language (str): Language code (not used in Dia model, kept for API compatibility)
101
 
102
- Returns:
103
- str: Path to the generated audio file
104
- """
105
- logger.info(f"Legacy Dia generate_speech called with text length: {len(text)}")
106
-
107
- # Check if Dia is available
108
- if not DIA_AVAILABLE:
109
- logger.warning("Dia is not available, falling back to dummy TTS engine")
110
- from utils.tts_base import DummyTTSEngine
111
- dummy_engine = DummyTTSEngine(language)
112
- return dummy_engine.generate_speech(text)
113
-
114
- # Use the new implementation via factory pattern
115
- try:
116
- # Import here to avoid circular imports
117
- from utils.tts_engines import DiaTTSEngine
118
 
119
- # Create a Dia engine and generate speech
120
- dia_engine = DiaTTSEngine(language)
121
- return dia_engine.generate_speech(text)
122
- except ModuleNotFoundError as e:
123
- logger.error(f"Module not found error in Dia generate_speech: {str(e)}")
124
- if "dac" in str(e):
125
- logger.warning("Dia TTS engine failed due to missing 'dac' module, falling back to dummy TTS")
126
- # Fall back to dummy TTS
127
- from utils.tts_base import DummyTTSEngine
128
- dummy_engine = DummyTTSEngine(language)
129
- return dummy_engine.generate_speech(text)
130
- except Exception as e:
131
- logger.error(f"Error in legacy Dia generate_speech: {str(e)}", exc_info=True)
132
- # Fall back to dummy TTS
133
- from utils.tts_base import DummyTTSEngine
134
- dummy_engine = DummyTTSEngine(language)
135
- return dummy_engine.generate_speech(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import logging
2
  import numpy as np
3
  import soundfile as sf
4
+ from typing import Optional, Generator, Tuple
5
+
6
+ from utils.tts_simplified import TTSBase, DummyTTS
7
 
8
  # Configure logging
 
9
  logger = logging.getLogger(__name__)
10
 
11
  # Flag to track Dia availability
12
  DIA_AVAILABLE = False
13
+ DEFAULT_SAMPLE_RATE = 44100  # Dia's DAC codec outputs 44.1 kHz audio
14
 
15
+ # Try to import Dia dependencies
16
  try:
17
  import torch
18
+ from dia.model import Dia
19
+ DIA_AVAILABLE = True
20
+ logger.info("Dia TTS engine is available")
 
 
 
 
 
 
 
 
21
+ except ModuleNotFoundError as e:
22
+ if "dac" in str(e):
23
+ logger.warning("Dia TTS engine is not available due to missing 'dac' module")
24
+ else:
25
+ logger.warning(f"Dia TTS engine is not available: {str(e)}")
26
+ except ImportError:
27
+ logger.warning("Dia TTS engine is not available")
28
  DIA_AVAILABLE = False
29
 
 
 
 
 
 
 
 
30
 
31
  def _get_model():
32
+ """Lazy-load the Dia model
 
33
 
34
+ Returns:
35
+ Dia or None: The Dia model or None if not available
36
+ """
37
  if not DIA_AVAILABLE:
38
+ logger.warning("Dia TTS engine is not available")
39
+ return None
40
+
41
+ try:
42
+ import torch
43
+ from dia.model import Dia
44
+
45
+ # Initialize the model
46
+ model = Dia.from_pretrained("nari-labs/Dia-1.6B")  # model id used elsewhere in this repo
47
+ logger.info("Dia model successfully loaded")
48
+ return model
49
+ except ImportError as e:
50
+ logger.error(f"Failed to import Dia dependencies: {str(e)}")
51
+ return None
52
+ except FileNotFoundError as e:
53
+ logger.error(f"Failed to load Dia model files: {str(e)}")
54
+ return None
55
+ except Exception as e:
56
+ logger.error(f"Failed to initialize Dia model: {str(e)}")
57
+ return None
58
+
59
+
60
+ class DiaTTS(TTSBase):
61
+ """Dia TTS engine implementation
62
+
63
+ This engine uses the Dia model for TTS generation.
64
+ """
65
+
66
+ def __init__(self, lang_code: str = 'z'):
67
+ """Initialize the Dia TTS engine
68
+
69
+ Args:
70
+ lang_code (str): Language code for the engine
71
+ """
72
+ super().__init__(lang_code)
73
+ self.model = None
74
+
75
+ def _ensure_model(self):
76
+ """Ensure the model is loaded
77
+
78
+ Returns:
79
+ bool: True if model is available, False otherwise
80
+ """
81
+ if self.model is None:
82
+ self.model = _get_model()
83
+
84
+ return self.model is not None
85
+
86
+ def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
87
+ """Generate speech using Dia TTS engine
88
+
89
+ Args:
90
+ text (str): Input text to synthesize
91
+ voice (str): Voice ID (not used in Dia)
92
+ speed (float): Speech speed multiplier (not used in Dia)
93
+
94
+ Returns:
95
+ Optional[str]: Path to the generated audio file or None if generation fails
96
+ """
97
+ logger.info(f"Generating speech with Dia for text length: {len(text)}")
98
+
99
+ # Check if Dia is available
100
+ if not DIA_AVAILABLE:
101
+ logger.warning("Dia TTS engine is not available, falling back to dummy TTS")
102
+ return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
103
+
104
+ # Ensure model is loaded
105
+ if not self._ensure_model():
106
+ logger.warning("Failed to load Dia model, falling back to dummy TTS")
107
+ return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
108
 
 
 
109
  try:
110
+ import torch
 
 
 
 
 
111
 
112
+ # Generate unique output path
113
+ output_path = self._generate_output_path(prefix="dia")
114
 
115
+ # Generate audio
116
+ with torch.inference_mode():
117
+ output_audio_np = self.model.generate(
118
+ text,
119
+ max_tokens=None,
120
+ cfg_scale=3.0,
121
+ temperature=1.3,
122
+ top_p=0.95,
123
+ cfg_filter_top_k=35,
124
+ use_torch_compile=False,
125
+ verbose=False
126
+ )
127
 
128
+ if output_audio_np is not None:
129
+ logger.info(f"Successfully generated audio with Dia (length: {len(output_audio_np)})")
130
+ sf.write(output_path, output_audio_np, DEFAULT_SAMPLE_RATE)
131
+ logger.info(f"Dia audio generation complete: {output_path}")
132
+ return output_path
 
133
  else:
134
+ logger.warning("Dia model returned None for audio output")
135
+ logger.warning("Falling back to dummy TTS")
136
+ return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
137
+
138
+ except ModuleNotFoundError as e:
139
+ if "dac" in str(e):
140
+ logger.warning("Dia TTS engine failed due to missing 'dac' module, falling back to dummy TTS")
141
+ else:
142
+ logger.error(f"Module not found error in Dia TTS: {str(e)}")
143
+ return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
144
  except Exception as e:
145
+ logger.error(f"Error generating speech with Dia: {str(e)}", exc_info=True)
146
+ logger.warning("Dia TTS engine failed, falling back to dummy TTS")
147
+ return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
 
 
 
 
 
 
148
 
149
+ def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
150
+ """Generate speech stream using Dia TTS engine
 
 
 
 
151
 
152
+ Args:
153
+ text (str): Input text to synthesize
154
+ voice (str): Voice ID (not used in Dia)
155
+ speed (float): Speech speed multiplier (not used in Dia)
156
+
157
+ Yields:
158
+ tuple: (sample_rate, audio_data) pairs for each segment
159
+ """
160
+ logger.info(f"Generating speech stream with Dia for text length: {len(text)}")
 
 
 
 
 
 
 
161
 
162
+ # Check if Dia is available
163
+ if not DIA_AVAILABLE:
164
+ logger.warning("Dia TTS engine is not available, falling back to dummy TTS")
165
+ yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
166
+ return
167
+
168
+ # Ensure model is loaded
169
+ if not self._ensure_model():
170
+ logger.warning("Failed to load Dia model, falling back to dummy TTS")
171
+ yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
172
+ return
173
+
174
+ try:
175
+ import torch
176
+
177
+ # Generate audio
178
+ with torch.inference_mode():
179
+ output_audio_np = self.model.generate(
180
+ text,
181
+ max_tokens=None,
182
+ cfg_scale=3.0,
183
+ temperature=1.3,
184
+ top_p=0.95,
185
+ cfg_filter_top_k=35,
186
+ use_torch_compile=False,
187
+ verbose=False
188
+ )
189
+
190
+ if output_audio_np is not None:
191
+ logger.info(f"Successfully generated audio with Dia (length: {len(output_audio_np)})")
192
+ yield DEFAULT_SAMPLE_RATE, output_audio_np
193
+ else:
194
+ logger.warning("Dia model returned None for audio output")
195
+ logger.warning("Falling back to dummy TTS")
196
+ yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
197
+
198
+ except ModuleNotFoundError as e:
199
+ if "dac" in str(e):
200
+ logger.warning("Dia TTS engine failed due to missing 'dac' module, falling back to dummy TTS")
201
+ else:
202
+ logger.error(f"Module not found error in Dia TTS: {str(e)}")
203
+ yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
204
+ except Exception as e:
205
+ logger.error(f"Error generating speech stream with Dia: {str(e)}", exc_info=True)
206
+ logger.warning("Dia TTS engine failed, falling back to dummy TTS")
207
+ yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
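A short streaming sketch for the new class; voice and speed are accepted but unused, per the docstrings above (input text is hypothetical):

    from utils.tts_dia import DiaTTS

    engine = DiaTTS(lang_code="z")
    # Yields (24000, numpy_array) pairs; DummyTTS segments are yielded on failure
    for sample_rate, audio in engine.generate_speech_stream("Hello world"):
        print(sample_rate, audio.shape)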
utils/tts_dia_space.py DELETED
@@ -1,154 +0,0 @@
-import os
-import time
-import logging
-import requests
-import numpy as np
-import soundfile as sf
-from typing import Optional, Tuple, Generator
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-# Constants
-DEFAULT_SAMPLE_RATE = 44100
-DEFAULT_API_URL = "https://droolingpanda-dia-tts-server.hf.space"
-DEFAULT_MODEL = "dia-1.6b"
-
-# Global client instance (lazy loaded)
-_client = None
-
-
-def _get_client():
-    """Lazy-load the Dia Space client to avoid loading it until needed"""
-    global _client
-    if _client is None:
-        logger.info("Loading Dia Space client...")
-        try:
-            # Import requests if not already imported
-            import requests
-
-            # Initialize the client (just a session for now)
-            logger.info("Initializing Dia Space client")
-            _client = requests.Session()
-
-            # Test connection to the API
-            response = _client.get(f"{DEFAULT_API_URL}/docs")
-            if response.status_code == 200:
-                logger.info("Dia Space client loaded successfully")
-                logger.info(f"Client type: {type(_client).__name__}")
-            else:
-                logger.warning(f"Dia Space API returned status code {response.status_code}")
-        except ImportError as import_err:
-            logger.error(f"Import error loading Dia Space client: {import_err}")
-            logger.error("This may indicate missing dependencies")
-            raise
-        except Exception as e:
-            logger.error(f"Error loading Dia Space client: {e}", exc_info=True)
-            logger.error(f"Error type: {type(e).__name__}")
-            raise
-    return _client
-
-
-def generate_speech(text: str, language: str = "zh", voice: str = "S1", response_format: str = "wav", speed: float = 1.0) -> str:
-    """Public interface for TTS generation using Dia Space API
-
-    This is a legacy function maintained for backward compatibility.
-    New code should use the factory pattern implementation directly.
-
-    Args:
-        text (str): Input text to synthesize
-        language (str): Language code (not used in Dia Space, kept for API compatibility)
-        voice (str): Voice mode to use ('S1', 'S2', 'dialogue', or filename for clone)
-        response_format (str): Audio format ('wav', 'mp3', 'opus')
-        speed (float): Speech speed multiplier
-
-    Returns:
-        str: Path to the generated audio file
-    """
-    logger.info(f"Legacy Dia Space generate_speech called with text length: {len(text)}")
-
-    # Use the new implementation via factory pattern
-    from utils.tts_engines import DiaSpaceTTSEngine
-
-    try:
-        # Create a Dia Space engine and generate speech
-        dia_space_engine = DiaSpaceTTSEngine(language)
-        return dia_space_engine.generate_speech(text, voice, speed, response_format)
-    except Exception as e:
-        logger.error(f"Error in legacy Dia Space generate_speech: {str(e)}", exc_info=True)
-        # Fall back to dummy TTS
-        from utils.tts_base import DummyTTSEngine
-        dummy_engine = DummyTTSEngine()
-        return dummy_engine.generate_speech(text)
-
-
-def _create_output_dir() -> str:
-    """Create output directory for audio files
-
-    Returns:
-        str: Path to the output directory
-    """
-    output_dir = "temp/outputs"
-    os.makedirs(output_dir, exist_ok=True)
-    return output_dir
-
-
-def _generate_output_path(prefix: str = "output", extension: str = "wav") -> str:
-    """Generate a unique output path for audio files
-
-    Args:
-        prefix (str): Prefix for the output filename
-        extension (str): File extension for the output file
-
-    Returns:
-        str: Path to the output file
-    """
-    output_dir = _create_output_dir()
-    timestamp = int(time.time())
-    return f"{output_dir}/{prefix}_{timestamp}.{extension}"
-
-
-def _call_dia_api(text: str, voice: str = "S1", response_format: str = "wav", speed: float = 1.0) -> bytes:
-    """Call the Dia Space API to generate speech
-
-    Args:
-        text (str): Input text to synthesize
-        voice (str): Voice mode to use ('S1', 'S2', 'dialogue', or filename for clone)
-        response_format (str): Audio format ('wav', 'mp3', 'opus')
-        speed (float): Speech speed multiplier
-
-    Returns:
-        bytes: Audio data
-    """
-    client = _get_client()
-
-    # Prepare the request payload
-    payload = {
-        "model": DEFAULT_MODEL,
-        "input": text,
-        "voice": voice,
-        "response_format": response_format,
-        "speed": speed
-    }
-
-    # Make the API request
-    logger.info(f"Calling Dia Space API with voice: {voice}, format: {response_format}, speed: {speed}")
-    try:
-        response = client.post(
-            f"{DEFAULT_API_URL}/v1/audio/speech",
-            json=payload,
-            headers={"Content-Type": "application/json"}
-        )
-
-        # Check for successful response
-        if response.status_code == 200:
-            logger.info("Dia Space API call successful")
-            return response.content
-        else:
-            logger.error(f"Dia Space API returned error: {response.status_code}")
-            logger.error(f"Response: {response.text}")
-            raise Exception(f"Dia Space API error: {response.status_code}")
-    except Exception as e:
-        logger.error(f"Error calling Dia Space API: {str(e)}", exc_info=True)
-        raise
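For reference, the request that the removed _call_dia_api issued can be reproduced standalone with the same endpoint and payload fields; whether the Space still serves this OpenAI-style route is not guaranteed:

    import requests

    payload = {
        "model": "dia-1.6b",
        "input": "Hello from Dia",  # hypothetical input text
        "voice": "S1",
        "response_format": "wav",
        "speed": 1.0,
    }
    response = requests.post(
        "https://droolingpanda-dia-tts-server.hf.space/v1/audio/speech",
        json=payload,
        headers={"Content-Type": "application/json"},
        timeout=60,  # not in the removed code; added so the sketch cannot hang
    )
    response.raise_for_status()
    with open("dia_space_output.wav", "wb") as f:
        f.write(response.content)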
utils/tts_engines.py DELETED
@@ -1,419 +0,0 @@
-import logging
-import time
-import os
-import numpy as np
-import soundfile as sf
-from typing import Dict, List, Optional, Tuple, Generator, Any, Union
-
-from utils.tts_base import TTSEngineBase, DummyTTSEngine
-
-# Configure logging
-logger = logging.getLogger(__name__)
-
-# Flag to track TTS engine availability
-KOKORO_AVAILABLE = False
-KOKORO_SPACE_AVAILABLE = True
-DIA_AVAILABLE = False
-DIA_SPACE_AVAILABLE = True
-
-# Try to import Kokoro
-try:
-    from kokoro import KPipeline
-    KOKORO_AVAILABLE = True
-    logger.info("Kokoro TTS engine is available")
-except AttributeError as e:
-    # Specifically catch the EspeakWrapper.set_data_path error
-    if "EspeakWrapper" in str(e) and "set_data_path" in str(e):
-        logger.warning("Kokoro import failed due to EspeakWrapper.set_data_path issue, falling back to Kokoro FastAPI server")
-    else:
-        # Re-raise if it's a different error
-        logger.error(f"Kokoro import failed with unexpected error: {str(e)}")
-        raise
-except ImportError:
-    logger.warning("Kokoro TTS engine is not available")
-
-# Try to import Dia dependencies to check availability
-try:
-    import torch
-    from dia.model import Dia
-    DIA_AVAILABLE = True
-    logger.info("Dia TTS engine is available")
-except ImportError:
-    logger.warning("Dia TTS engine is not available")
-except ModuleNotFoundError as e:
-    if "dac" in str(e):
-        logger.warning("Dia TTS engine is not available due to missing 'dac' module")
-    else:
-        logger.warning(f"Dia TTS engine is not available: {str(e)}")
-    DIA_AVAILABLE = False
-
-
-class KokoroTTSEngine(TTSEngineBase):
-    """Kokoro TTS engine implementation
-
-    This engine uses the Kokoro library for TTS generation.
-    """
-
-    def __init__(self, lang_code: str = 'z'):
-        super().__init__(lang_code)
-        try:
-            self.pipeline = KPipeline(lang_code=lang_code)
-            logger.info("Kokoro TTS engine successfully initialized")
-        except Exception as e:
-            logger.error(f"Failed to initialize Kokoro pipeline: {str(e)}")
-            logger.error(f"Error type: {type(e).__name__}")
-            raise
-
-    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Optional[str]:
-        """Generate speech using Kokoro TTS engine
-
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
-            speed (float): Speech speed multiplier (0.5 to 2.0)
-
-        Returns:
-            Optional[str]: Path to the generated audio file or None if generation fails
-        """
-        logger.info(f"Generating speech with Kokoro for text length: {len(text)}")
-
-        # Generate unique output path
-        output_path = self._generate_output_path()
-
-        # Generate speech
-        generator = self.pipeline(text, voice=voice, speed=speed)
-        for _, _, audio in generator:
-            logger.info(f"Saving Kokoro audio to {output_path}")
-            sf.write(output_path, audio, 24000)
-            break
-
-        logger.info(f"Kokoro audio generation complete: {output_path}")
-        return output_path
-
-    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
-        """Generate speech stream using Kokoro TTS engine
-
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice ID to use
-            speed (float): Speech speed multiplier
-
-        Yields:
-            tuple: (sample_rate, audio_data) pairs for each segment
-        """
-        logger.info(f"Generating speech stream with Kokoro for text length: {len(text)}")
-
-        # Generate speech stream
-        generator = self.pipeline(text, voice=voice, speed=speed)
-        for _, _, audio in generator:
-            yield 24000, audio
-
-
-class KokoroSpaceTTSEngine(TTSEngineBase):
-    """Kokoro Space TTS engine implementation
-
-    This engine uses the Kokoro FastAPI server for TTS generation.
-    """
-
-    def __init__(self, lang_code: str = 'z'):
-        super().__init__(lang_code)
-        try:
-            from gradio_client import Client
-            self.client = Client("Remsky/Kokoro-TTS-Zero")
-            logger.info("Kokoro Space TTS engine successfully initialized")
-        except Exception as e:
-            logger.error(f"Failed to initialize Kokoro Space client: {str(e)}")
-            logger.error(f"Error type: {type(e).__name__}")
-            raise
-
-    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Optional[str]:
-        """Generate speech using Kokoro Space TTS engine
-
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
-            speed (float): Speech speed multiplier (0.5 to 2.0)
-
-        Returns:
-            Optional[str]: Path to the generated audio file or None if generation fails
-        """
-        logger.info(f"Generating speech with Kokoro Space for text length: {len(text)}")
-        logger.info(f"Text to generate speech on is: {text[:50]}..." if len(text) > 50 else f"Text to generate speech on is: {text}")
-
-        # Generate unique output path
-        output_path = self._generate_output_path()
-
-        try:
-            # Use af_nova as the default voice for Kokoro Space
-            voice_to_use = 'af_nova' if voice == 'af_heart' else voice
-
-            # Generate speech
-            result = self.client.predict(
-                text=text,
-                voice_names=voice_to_use,
-                speed=speed,
-                api_name="/generate_speech_from_ui"
-            )
-            logger.info(f"Received audio from Kokoro FastAPI server: {result}")
-
-            # Process the result and save to output_path
-            # Return the result path directly if it's a string
-            if isinstance(result, str) and os.path.exists(result):
-                return result
-            else:
-                logger.warning("Unexpected result from Kokoro Space")
-                return None
-
-        except Exception as e:
-            logger.error(f"Failed to generate speech from Kokoro FastAPI server: {str(e)}")
-            logger.error(f"Error type: {type(e).__name__}")
-            logger.info("Kokoro Space TTS engine failed")
-            return None
-
-
-class DiaTTSEngine(TTSEngineBase):
-    """Dia TTS engine implementation
-
-    This engine uses the Dia model for TTS generation.
-    """
-
-    def __init__(self, lang_code: str = 'z'):
-        super().__init__(lang_code)
-        # Dia doesn't need initialization here, it will be lazy-loaded when needed
-        logger.info("Dia TTS engine initialized (lazy loading)")
-
-    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Optional[str]:
-        """Generate speech using Dia TTS engine
-
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice ID (not used in Dia)
-            speed (float): Speech speed multiplier (not used in Dia)
-
-        Returns:
-            Optional[str]: Path to the generated audio file or None if generation fails
-        """
-        logger.info(f"Generating speech with Dia for text length: {len(text)}")
-
-        try:
-            # Import here to avoid circular imports
-            from utils.tts_dia import generate_speech as dia_generate_speech, DIA_AVAILABLE
-
-            # Check if Dia is available
-            if not DIA_AVAILABLE:
-                logger.warning("Dia TTS engine is not available")
-                return None
-
-            logger.info("Successfully imported Dia speech generation function")
-
-            # Call Dia's generate_speech function
-            # Note: Dia's function expects a language parameter, not voice or speed
-            output_path = dia_generate_speech(text, language=self.lang_code)
-            logger.info(f"Generated audio with Dia: {output_path}")
-            return output_path
-        except ModuleNotFoundError as e:
-            if "dac" in str(e):
-                logger.warning("Dia TTS engine failed due to missing 'dac' module")
-                return None
-            raise
-        except Exception as e:
-            logger.error(f"Error generating speech with Dia: {str(e)}", exc_info=True)
-            logger.warning("Dia TTS engine failed")
-            return None
-
-
-class DiaSpaceTTSEngine(TTSEngineBase):
-    """Dia Space TTS engine implementation
-
-    This engine uses the Dia TTS Server API for speech generation.
-    """
-
-    def __init__(self, lang_code: str = 'z'):
-        super().__init__(lang_code)
-        try:
-            # Import here to avoid circular imports
-            from utils.tts_dia_space import _get_client
-            self.client = _get_client()
-            logger.info("Dia Space TTS engine successfully initialized")
-        except Exception as e:
-            logger.error(f"Failed to initialize Dia Space client: {str(e)}")
-            logger.error(f"Error type: {type(e).__name__}")
-            raise
-
-    def generate_speech(self, text: str, voice: str = 'S1', speed: float = 1.0, response_format: str = 'wav') -> Optional[str]:
-        """Generate speech using Dia Space TTS engine
-
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice mode to use ('S1', 'S2', 'dialogue', or filename for clone)
-            speed (float): Speech speed multiplier
-            response_format (str): Audio format ('wav', 'mp3', 'opus')
-
-        Returns:
-            Optional[str]: Path to the generated audio file or None if generation fails
-        """
-        logger.info(f"Generating speech with Dia Space for text length: {len(text)}")
-
-        try:
-            # Import here to avoid circular imports
-            from utils.tts_dia_space import _call_dia_api, _generate_output_path
-
-            # Call the Dia Space API
-            audio_data = _call_dia_api(text, voice, response_format, speed)
-
-            # Save the audio data to a file
-            output_path = _generate_output_path(prefix="dia_space", extension=response_format)
-            with open(output_path, 'wb') as f:
-                f.write(audio_data)
-
-            logger.info(f"Generated audio with Dia Space: {output_path}")
-            return output_path
-        except Exception as e:
-            logger.error(f"Failed to generate speech from Dia Space API: {str(e)}")
-            logger.error(f"Error type: {type(e).__name__}")
-            logger.info("Dia Space TTS engine failed")
-            return None
-
-        except ImportError as import_err:
-            logger.error(f"Dia TTS generation failed due to import error: {str(import_err)}")
-            logger.error("Dia Space TTS engine failed")
-            return None
-
-        except Exception as dia_error:
-            logger.error(f"Dia TTS generation failed: {str(dia_error)}", exc_info=True)
-            logger.error(f"Error type: {type(dia_error).__name__}")
-            logger.error("Dia Space TTS engine failed")
-            return None
-
-    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
-        """Generate speech stream using Dia TTS engine
-
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice ID (not used in Dia)
-            speed (float): Speech speed multiplier (not used in Dia)
-
-        Yields:
-            tuple: (sample_rate, audio_data) pairs for each segment
-        """
-        logger.info(f"Generating speech stream with Dia for text length: {len(text)}")
-
-        try:
-            # Import required modules
-            from utils.tts_dia import _get_model, DEFAULT_SAMPLE_RATE, DIA_AVAILABLE
-
-            # Check if Dia is available
-            if not DIA_AVAILABLE:
-                logger.warning("Dia TTS engine is not available, falling back to dummy audio stream")
-                yield from DummyTTSEngine(self.lang_code).generate_speech_stream(text, voice, speed)
-                return
-
-            import torch
-
-            # Get the Dia model
-            model = _get_model()
-
-            # Generate audio
-            with torch.inference_mode():
-                output_audio_np = model.generate(
-                    text,
-                    max_tokens=None,
-                    cfg_scale=3.0,
-                    temperature=1.3,
-                    top_p=0.95,
-                    cfg_filter_top_k=35,
-                    use_torch_compile=False,
-                    verbose=False
-                )
-
-            if output_audio_np is not None:
-                logger.info(f"Successfully generated audio with Dia (length: {len(output_audio_np)})")
-                yield DEFAULT_SAMPLE_RATE, output_audio_np
-            else:
-                logger.warning("Dia model returned None for audio output")
-                logger.warning("Falling back to dummy audio stream")
-                yield from DummyTTSEngine(self.lang_code).generate_speech_stream(text, voice, speed)
-
-        except ModuleNotFoundError as e:
-            if "dac" in str(e):
-                logger.warning("Dia TTS streaming failed due to missing 'dac' module, falling back to dummy audio stream")
-            else:
-                logger.error(f"Module not found error in Dia TTS streaming: {str(e)}")
-            yield from DummyTTSEngine(self.lang_code).generate_speech_stream(text, voice, speed)
-
-        except ImportError as import_err:
-            logger.error(f"Dia TTS streaming failed due to import error: {str(import_err)}")
-            logger.error("Falling back to dummy audio stream")
-            yield from DummyTTSEngine(self.lang_code).generate_speech_stream(text, voice, speed)
-
-        except Exception as dia_error:
-            logger.error(f"Dia TTS streaming failed: {str(dia_error)}", exc_info=True)
-            logger.error(f"Error type: {type(dia_error).__name__}")
-            logger.error("Falling back to dummy audio stream")
-            yield from DummyTTSEngine(self.lang_code).generate_speech_stream(text, voice, speed)
-
-
-def get_available_engines() -> List[str]:
-    """Get a list of available TTS engines
-
-    Returns:
-        List[str]: List of available engine names
-    """
-    available = []
-
-    if KOKORO_AVAILABLE:
-        available.append('kokoro')
-
-    if KOKORO_SPACE_AVAILABLE:
-        available.append('kokoro_space')
-
-    if DIA_AVAILABLE:
-        available.append('dia')
-
-    if DIA_SPACE_AVAILABLE:
-        available.append('dia_space')
-
-    # Dummy is always available
-    available.append('dummy')
-
-    return available
-
-
-def create_engine(engine_type: str, lang_code: str = 'z') -> TTSEngineBase:
-    """Create a specific TTS engine
-
-    Args:
-        engine_type (str): Type of engine to create ('kokoro', 'kokoro_space', 'dia', 'dia_space', 'dummy')
-        lang_code (str): Language code for the engine
-
-    Returns:
-        TTSEngineBase: An instance of the requested TTS engine
-
-    Raises:
-        ValueError: If the requested engine type is not supported
-    """
-    if engine_type == 'kokoro':
-        if not KOKORO_AVAILABLE:
-            raise ValueError("Kokoro TTS engine is not available")
-        return KokoroTTSEngine(lang_code)
-
-    elif engine_type == 'kokoro_space':
-        if not KOKORO_SPACE_AVAILABLE:
-            raise ValueError("Kokoro Space TTS engine is not available")
-        return KokoroSpaceTTSEngine(lang_code)
-
-    elif engine_type == 'dia':
-        if not DIA_AVAILABLE:
-            raise ValueError("Dia TTS engine is not available")
-        return DiaTTSEngine(lang_code)
-
-    elif engine_type == 'dia_space':
-        if not DIA_SPACE_AVAILABLE:
-            raise ValueError("Dia Space TTS engine is not available")
-        return DiaSpaceTTSEngine(lang_code)
-
-    elif engine_type == 'dummy':
-        return DummyTTSEngine(lang_code)
-
-    else:
-        raise ValueError(f"Unsupported TTS engine type: {engine_type}")
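Before deletion, these module-level helpers were the programmatic entry point; a usage sketch of the removed API (engine names and availability flags as defined above; input text hypothetical):

    from utils.tts_engines import get_available_engines, create_engine

    engines = get_available_engines()  # e.g. ['kokoro_space', 'dia_space', 'dummy']
    engine = create_engine(engines[0], lang_code='z')
    path = engine.generate_speech("Hello world")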
utils/tts_factory.py DELETED
@@ -1,77 +0,0 @@
-import logging
-from typing import Optional, List
-
-# Configure logging
-logger = logging.getLogger(__name__)
-
-# Import the base class
-from utils.tts_base import TTSEngineBase, DummyTTSEngine
-from utils.tts_cascading import CascadingTTSEngine
-
-class TTSFactory:
-    """Factory class for creating TTS engines
-
-    This class is responsible for creating the appropriate TTS engine based on
-    availability and configuration.
-    """
-
-    @staticmethod
-    def create_engine(engine_type: Optional[str] = None, lang_code: str = 'z') -> TTSEngineBase:
-        """Create a TTS engine instance
-
-        Args:
-            engine_type (str, optional): Type of engine to create ('kokoro', 'kokoro_space', 'dia', 'dummy')
-                If None, the best available engine will be used
-            lang_code (str): Language code for the engine
-
-        Returns:
-            TTSEngineBase: An instance of a TTS engine
-        """
-        from utils.tts_engines import get_available_engines, create_engine
-
-        # Get available engines
-        available_engines = get_available_engines()
-        logger.info(f"Available TTS engines: {available_engines}")
-
-        # If engine_type is specified, try to create that specific engine
-        if engine_type is not None:
-            if engine_type in available_engines:
-                logger.info(f"Creating requested engine: {engine_type}")
-                engine = create_engine(engine_type, lang_code)
-                return engine
-            else:
-                logger.warning(f"Requested engine '{engine_type}' is not available")
-
-        # Fall back to dummy engine if no engines are available
-        if not available_engines or (len(available_engines) == 1 and available_engines[0] == 'dummy'):
-            logger.warning("No TTS engines available, falling back to dummy engine")
-            return DummyTTSEngine(lang_code)
-
-        return TTSFactory.create_cascading_engine(available_engines, lang_code)
-
-    @staticmethod
-    def create_cascading_engine(available_engines: List[str], lang_code: str = 'z') -> TTSEngineBase:
-        """Create a cascading TTS engine that tries multiple engines in order
-
-        Args:
-            available_engines (List[str]): List of available engine names
-            lang_code (str): Language code for the engines
-
-        Returns:
-            TTSEngineBase: A cascading TTS engine instance
-        """
-        from utils.tts_engines import create_engine
-
-        # Define the priority order for engines
-        priority_order = ['kokoro', 'kokoro_space', 'dia', 'dia_space', 'dummy']
-
-        # Filter and sort available engines by priority
-        engines_by_priority = [engine for engine in priority_order if engine in available_engines]
-
-        # Always ensure dummy is the last fallback
-        if 'dummy' not in engines_by_priority:
-            engines_by_priority.append('dummy')
-
-        logger.info(f"Creating cascading engine with priority: {engines_by_priority}")
-
-        return CascadingTTSEngine(engines_by_priority, lang_code)
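The removed factory composed the engines above into a fallback chain; a usage sketch of the deleted API (priority list as hard-coded above; input text hypothetical):

    from utils.tts_factory import TTSFactory

    # Explicit engine type: returned directly when available
    kokoro = TTSFactory.create_engine('kokoro', lang_code='z')

    # No engine type: a CascadingTTSEngine over
    # ['kokoro', 'kokoro_space', 'dia', 'dia_space', 'dummy'],
    # with 'dummy' always appended as the final fallback
    engine = TTSFactory.create_engine(lang_code='z')
    path = engine.generate_speech("Hello world")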
utils/tts_kokoro.py CHANGED
@@ -1,106 +1,148 @@
-import os
-import time
 import logging
 import numpy as np
 import soundfile as sf
-from typing import Optional, Tuple, Generator
 
 # Configure logging
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Constants
-DEFAULT_SAMPLE_RATE = 24000
 
-# Global model instance (lazy loaded)
-_pipeline = None
 
 
 def _get_pipeline(lang_code: str = 'z'):
-    """Lazy-load the Kokoro pipeline to avoid loading it until needed"""
-    global _pipeline
-    if _pipeline is None:
-        logger.info("Loading Kokoro pipeline...")
-        try:
-            # Import Kokoro
-            from kokoro import KPipeline
-
-            # Initialize the pipeline
-            logger.info(f"Initializing Kokoro pipeline with language code: {lang_code}")
-            _pipeline = KPipeline(lang_code=lang_code)
-
-            # Log pipeline details
-            logger.info(f"Kokoro pipeline loaded successfully")
-            logger.info(f"Pipeline type: {type(_pipeline).__name__}")
-        except ImportError as import_err:
-            logger.error(f"Import error loading Kokoro pipeline: {import_err}")
-            logger.error(f"This may indicate missing dependencies")
-            raise
-        except Exception as e:
-            logger.error(f"Error loading Kokoro pipeline: {e}", exc_info=True)
-            logger.error(f"Error type: {type(e).__name__}")
-            raise
-    return _pipeline
-
-
-def generate_speech(text: str, language: str = "z", voice: str = "af_heart", speed: float = 1.0) -> str:
-    """Public interface for TTS generation using Kokoro model
-
-    This is a legacy function maintained for backward compatibility.
-    New code should use the factory pattern implementation directly.
 
     Args:
-        text (str): Input text to synthesize
-        language (str): Language code ('a' for US English, 'b' for British English,
-                        'j' for Japanese, 'z' for Mandarin Chinese)
-        voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
-        speed (float): Speech speed multiplier (0.5 to 2.0)
 
     Returns:
-        str: Path to the generated audio file
     """
-    logger.info(f"Legacy Kokoro generate_speech called with text length: {len(text)}")
-
-    # Use the new implementation via factory pattern
-    from utils.tts_engines import KokoroTTSEngine
 
     try:
-        # Create a Kokoro engine and generate speech
-        kokoro_engine = KokoroTTSEngine(language)
-        return kokoro_engine.generate_speech(text, voice, speed)
     except Exception as e:
-        logger.error(f"Error in legacy Kokoro generate_speech: {str(e)}", exc_info=True)
-        # Fall back to dummy TTS
-        from utils.tts_base import DummyTTSEngine
-        dummy_engine = DummyTTSEngine()
-        return dummy_engine.generate_speech(text)
 
 
-def generate_speech_stream(text: str, language: str = "z", voice: str = "af_heart", speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
-    """Generate speech stream using Kokoro TTS engine
 
-    Args:
-        text (str): Input text to synthesize
-        language (str): Language code
-        voice (str): Voice ID to use
-        speed (float): Speech speed multiplier
-
-    Yields:
-        tuple: (sample_rate, audio_data) pairs for each segment
     """
-    logger.info(f"Generating speech stream with Kokoro for text length: {len(text)}")
 
-    try:
-        # Get the Kokoro pipeline
-        pipeline = _get_pipeline(language)
-
-        # Generate speech stream
-        generator = pipeline(text, voice=voice, speed=speed)
-        for _, _, audio in generator:
-            yield DEFAULT_SAMPLE_RATE, audio
-    except Exception as e:
-        logger.error(f"Error in Kokoro generate_speech_stream: {str(e)}", exc_info=True)
-        # Fall back to dummy TTS
-        from utils.tts_base import DummyTTSEngine
-        dummy_engine = DummyTTSEngine()
-        yield from dummy_engine.generate_speech_stream(text)
 import logging
 import numpy as np
 import soundfile as sf
+from typing import Optional, Generator, Tuple
+
+from utils.tts_simplified import TTSBase, DummyTTS
 
 # Configure logging
 logger = logging.getLogger(__name__)
 
+# Flag to track Kokoro availability
+KOKORO_AVAILABLE = False
 
+# Try to import Kokoro
+try:
+    from kokoro import KPipeline
+    KOKORO_AVAILABLE = True
+    logger.info("Kokoro TTS engine is available")
+except ImportError:
+    logger.warning("Kokoro TTS engine is not available")
+except Exception as e:
+    logger.error(f"Kokoro import failed with unexpected error: {str(e)}")
+    KOKORO_AVAILABLE = False
 
 
 def _get_pipeline(lang_code: str = 'z'):
+    """Lazy-load the Kokoro pipeline
 
     Args:
+        lang_code (str): Language code for the pipeline
 
     Returns:
+        KPipeline or None: The Kokoro pipeline or None if not available
     """
+    if not KOKORO_AVAILABLE:
+        logger.warning("Kokoro TTS engine is not available")
+        return None
 
     try:
+        pipeline = KPipeline(lang_code=lang_code)
+        logger.info("Kokoro pipeline successfully loaded")
+        return pipeline
     except Exception as e:
+        logger.error(f"Failed to initialize Kokoro pipeline: {str(e)}")
+        return None
 
 
+class KokoroTTS(TTSBase):
+    """Kokoro TTS engine implementation
+
+    This engine uses the Kokoro library for TTS generation.
+    """
+
+    def __init__(self, lang_code: str = 'z'):
+        """Initialize the Kokoro TTS engine
+
+        Args:
+            lang_code (str): Language code for the engine
+        """
+        super().__init__(lang_code)
+        self.pipeline = None
+
+    def _ensure_pipeline(self):
+        """Ensure the pipeline is loaded
+
+        Returns:
+            bool: True if pipeline is available, False otherwise
+        """
+        if self.pipeline is None:
+            self.pipeline = _get_pipeline(self.lang_code)
+
+        return self.pipeline is not None
+
+    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Optional[str]:
+        """Generate speech using Kokoro TTS engine
+
+        Args:
+            text (str): Input text to synthesize
+            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
+            speed (float): Speech speed multiplier (0.5 to 2.0)
+
+        Returns:
+            Optional[str]: Path to the generated audio file or None if generation fails
+        """
+        logger.info(f"Generating speech with Kokoro for text length: {len(text)}")
+
+        # Check if Kokoro is available
+        if not KOKORO_AVAILABLE:
+            logger.warning("Kokoro TTS engine is not available, falling back to dummy TTS")
+            return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
+
+        # Ensure pipeline is loaded
+        if not self._ensure_pipeline():
+            logger.warning("Failed to load Kokoro pipeline, falling back to dummy TTS")
+            return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
+
+        try:
+            # Generate unique output path
+            output_path = self._generate_output_path(prefix="kokoro")
+
+            # Generate speech
+            generator = self.pipeline(text, voice=voice, speed=speed)
+            for _, _, audio in generator:
+                logger.info(f"Saving Kokoro audio to {output_path}")
+                sf.write(output_path, audio, 24000)
+                break
+
+            logger.info(f"Kokoro audio generation complete: {output_path}")
+            return output_path
+        except Exception as e:
+            logger.error(f"Error generating speech with Kokoro: {str(e)}", exc_info=True)
+            logger.warning("Kokoro TTS engine failed, falling back to dummy TTS")
+            return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
+
+    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
+        """Generate speech stream using Kokoro TTS engine
+
+        Args:
+            text (str): Input text to synthesize
+            voice (str): Voice ID to use
+            speed (float): Speech speed multiplier
+
+        Yields:
+            tuple: (sample_rate, audio_data) pairs for each segment
+        """
+        logger.info(f"Generating speech stream with Kokoro for text length: {len(text)}")
+
+        # Check if Kokoro is available
+        if not KOKORO_AVAILABLE:
+            logger.warning("Kokoro TTS engine is not available, falling back to dummy TTS")
+            yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
+            return
+
+        # Ensure pipeline is loaded
+        if not self._ensure_pipeline():
+            logger.warning("Failed to load Kokoro pipeline, falling back to dummy TTS")
+            yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
+            return
+
+        try:
+            # Generate speech stream
+            generator = self.pipeline(text, voice=voice, speed=speed)
+            for _, _, audio in generator:
+                yield 24000, audio
+        except Exception as e:
+            logger.error(f"Error generating speech stream with Kokoro: {str(e)}", exc_info=True)
+            logger.warning("Kokoro TTS engine failed, falling back to dummy TTS")
+            yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
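A short usage sketch for the refactored class (voice IDs as documented above; input text hypothetical):

    from utils.tts_kokoro import KokoroTTS

    engine = KokoroTTS(lang_code='z')

    # One-shot: writes a 24 kHz wav and returns its path (or a DummyTTS path on failure)
    path = engine.generate_speech("Hello world", voice='af_heart', speed=1.0)

    # Streaming: yields (24000, audio_chunk) per segment
    for sample_rate, audio in engine.generate_speech_stream("Hello world"):
        print(sample_rate, len(audio))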
utils/tts_kokoro_space.py DELETED
@@ -1,100 +0,0 @@
-import os
-import time
-import logging
-import numpy as np
-import soundfile as sf
-from typing import Optional, Tuple, Generator
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-# Constants
-DEFAULT_SAMPLE_RATE = 24000
-
-# Global client instance (lazy loaded)
-_client = None
-
-
-def _get_client():
-    """Lazy-load the Kokoro Space client to avoid loading it until needed"""
-    global _client
-    if _client is None:
-        logger.info("Loading Kokoro Space client...")
-        try:
-            # Import gradio client
-            from gradio_client import Client
-
-            # Initialize the client
-            logger.info("Initializing Kokoro Space client")
-            _client = Client("Remsky/Kokoro-TTS-Zero")
-
-            # Log client details
-            logger.info("Kokoro Space client loaded successfully")
-            logger.info(f"Client type: {type(_client).__name__}")
-        except ImportError as import_err:
-            logger.error(f"Import error loading Kokoro Space client: {import_err}")
-            logger.error("This may indicate missing dependencies")
-            raise
-        except Exception as e:
-            logger.error(f"Error loading Kokoro Space client: {e}", exc_info=True)
-            logger.error(f"Error type: {type(e).__name__}")
-            raise
-    return _client
-
-
-def generate_speech(text: str, language: str = "z", voice: str = "af_nova", speed: float = 1.0) -> str:
-    """Public interface for TTS generation using Kokoro Space
-
-    This is a legacy function maintained for backward compatibility.
-    New code should use the factory pattern implementation directly.
-
-    Args:
-        text (str): Input text to synthesize
-        language (str): Language code (not used in Kokoro Space, kept for API compatibility)
-        voice (str): Voice ID to use (e.g., 'af_nova', 'af_bella', etc.)
-        speed (float): Speech speed multiplier (0.5 to 2.0)
-
-    Returns:
-        str: Path to the generated audio file
-    """
-    logger.info(f"Legacy Kokoro Space generate_speech called with text length: {len(text)}")
-
-    # Use the new implementation via factory pattern
-    from utils.tts_engines import KokoroSpaceTTSEngine
-
-    try:
-        # Create a Kokoro Space engine and generate speech
-        kokoro_space_engine = KokoroSpaceTTSEngine(language)
-        return kokoro_space_engine.generate_speech(text, voice, speed)
-    except Exception as e:
-        logger.error(f"Error in legacy Kokoro Space generate_speech: {str(e)}", exc_info=True)
-        # Fall back to dummy TTS
-        from utils.tts_base import DummyTTSEngine
-        dummy_engine = DummyTTSEngine()
-        return dummy_engine.generate_speech(text)
-
-
-def _create_output_dir() -> str:
-    """Create output directory for audio files
-
-    Returns:
-        str: Path to the output directory
-    """
-    output_dir = "temp/outputs"
-    os.makedirs(output_dir, exist_ok=True)
-    return output_dir
-
-
-def _generate_output_path(prefix: str = "output") -> str:
-    """Generate a unique output path for audio files
-
-    Args:
-        prefix (str): Prefix for the output filename
-
-    Returns:
-        str: Path to the output file
-    """
-    output_dir = _create_output_dir()
-    timestamp = int(time.time())
-    return f"{output_dir}/{prefix}_{timestamp}.wav"
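The deleted module wrapped the same gradio_client call used by the also-removed KokoroSpaceTTSEngine; the underlying pattern can still be reproduced directly, assuming the Space remains reachable:

    from gradio_client import Client

    client = Client("Remsky/Kokoro-TTS-Zero")
    result = client.predict(
        text="Hello world",  # hypothetical input text
        voice_names="af_nova",
        speed=1.0,
        api_name="/generate_speech_from_ui",
    )
    print(result)  # path to the returned audio file, per the removed engine code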