# File size: 17,611 Bytes
#
# 29c0409

import re
import torch
import torchaudio.functional as F
import torchaudio
import uroman as ur
import logging
from typing import List, Dict, Any, Optional

def split_and_merge_punctuation(text: str) -> List[str]:
    """
    Split English text into word tokens, keeping punctuation attached to words.

    The text is split on whitespace, and every chunk is further divided into
    runs of word characters (``\\w+``) and runs of punctuation (``[^\\w\\s]+``).
    Exactly one token is produced per word run, so the number of tokens equals
    the number of words obtained by replacing ``[^\\w\\s]`` with spaces and
    splitting on whitespace. Punctuation is attached as follows:

    * punctuation after a word is appended to that word (``world!``);
    * punctuation before the first word of a chunk is prefixed to that word
      (``"Hello``);
    * a chunk made only of punctuation is appended to the previous token, or
      prefixed to the next word when no token exists yet.

    Args:
        text: Input English text.

    Returns:
        List of word tokens with punctuation merged in. If the text contains
        punctuation but no word at all, the punctuation is returned as a
        single token; an empty or whitespace-only text yields ``[]``.
    """
    tokens: List[str] = []
    # Punctuation seen before any word exists that it could be attached to.
    carried = ""

    for chunk in text.split():
        chunk_tokens: List[str] = []
        prefix = carried

        # Group 1 matches a word run; a match without group 1 is punctuation.
        # Classifying by content (instead of by position parity) keeps the
        # result correct when a chunk starts with punctuation.
        for match in re.finditer(r'(\w+)|[^\w\s]+', chunk):
            piece = match.group(0)
            if match.group(1) is not None:
                chunk_tokens.append(prefix + piece)
                prefix = ""
            elif chunk_tokens:
                chunk_tokens[-1] += piece
            else:
                prefix += piece

        if chunk_tokens:
            tokens.extend(chunk_tokens)
            carried = ""
        elif tokens:
            # Punctuation-only chunk: glue it to the preceding word.
            tokens[-1] += prefix
            carried = ""
        else:
            # No word seen so far: keep the punctuation for the next word.
            carried = prefix

    if carried:
        tokens.append(carried)

    return tokens

def restore_spaces_in_english_text(tokens: List[str]) -> str:
    """
    Join tokens into a string, re-inserting spaces between plain words.

    A single space is inserted before a token when all of the following hold:
    it is not the first token, its first character is alphanumeric, and the
    previous token contains none of the characters ``,.!?;:()[]<>'"…``.

    NOTE(review): as written, no space is inserted after a token that carries
    punctuation (``["Hello,", "world"]`` -> ``"Hello,world"``). This behavior
    is preserved here; confirm with callers that it is intended.

    Args:
        tokens: List of tokens. Empty strings are allowed and are never
            preceded by a space.

    Returns:
        The concatenated text.
    """
    punctuation = frozenset(',.!?;:()[]<>\'"…')

    pieces: List[str] = []
    previous: Optional[str] = None
    for token in tokens:
        # token[:1] instead of token[0] so an empty token cannot raise IndexError.
        if (
            previous is not None
            and token[:1].isalnum()
            and punctuation.isdisjoint(previous)
        ):
            pieces.append(" ")
        pieces.append(token)
        previous = token

    return "".join(pieces)

def get_aligned_result_with_punctuation(alignment_result: List[Dict], text: str) -> List[Dict]:
    """
    Replace the transcript of each aligned word with its punctuated text token.

    The original text is tokenized with ``split_and_merge_punctuation`` and the
    resulting tokens are paired positionally with the alignment entries. Each
    output entry takes ``start`` and ``end`` from the alignment entry,
    ``transcript`` from the text token, and copies every other key of the
    alignment entry unchanged.

    If the two sequences differ in length, pairing stops at the shorter one and
    a warning is logged, because a mismatch means the remaining transcripts may
    be attached to the wrong timestamps.

    Args:
        alignment_result: Alignment entries; each must contain ``"start"`` and
            ``"end"`` keys.
        text: Original text, including punctuation.

    Returns:
        A new list of entries with punctuated transcripts.

    Raises:
        KeyError: If an alignment entry lacks ``"start"`` or ``"end"``.
    """
    text_tokens = split_and_merge_punctuation(text)

    if len(text_tokens) != len(alignment_result):
        logging.getLogger(__name__).warning(
            "Token count mismatch: %d aligned words vs %d text tokens; "
            "unpaired entries are dropped",
            len(alignment_result),
            len(text_tokens),
        )

    overridden_keys = ("start", "end", "transcript")
    updated_alignment_result = []

    for align_item, text_token in zip(alignment_result, text_tokens):
        updated_item = {
            "start": align_item["start"],
            "end": align_item["end"],
            "transcript": text_token
        }
        # Carry over any additional fields (e.g. pause/duration) untouched.
        updated_item.update({
            key: value for key, value in align_item.items()
            if key not in overridden_keys
        })
        updated_alignment_result.append(updated_item)

    return updated_alignment_result

class EnglishAlignmentModel:
    """
    Word-level forced aligner for English speech based on torchaudio's MMS_FA bundle.

    The model produces frame-level emissions for a batch of waveforms, aligns
    them against character tokens derived from the transcript, and converts the
    resulting character spans into per-word timestamps.
    """

    def __init__(self, device: str = "cuda", model_dir: Optional[str] = None):
        """
        Load the MMS_FA alignment model and its token dictionary.

        Args:
            device: Device string passed to ``torch.device`` ("cuda" or "cpu").
            model_dir: Directory forwarded to the model download via
                ``dl_kwargs['model_dir']``; when falsy, no download options
                are passed.
        """
        self.device = torch.device(device)
        self.bundle = torchaudio.pipelines.MMS_FA

        # Model download options
        dl_kwargs = {}
        if model_dir:
            dl_kwargs['model_dir'] = model_dir

        # with_star=False: the extra '*' output class is appended manually in
        # batch_alignment instead of being produced by the model.
        self.align_model = self.bundle.get_model(
            with_star=False,
            dl_kwargs=dl_kwargs
        ).to(self.device)

        self.uroman = ur.Uroman()
        self.DICTIONARY = self.bundle.get_dict()

        # Sample rate of the audio handed to align_audio_text. Callers may set
        # it after construction; None means "already at the bundle's rate".
        self.original_sample_rate: Optional[int] = None

    @staticmethod
    def _normalize_words(transcript: str) -> List[str]:
        """Replace punctuation with spaces, lowercase, and split into words."""
        return re.sub(r'[^\w\s]', r' ', transcript).lower().split()

    def align(self, emission: torch.Tensor, tokens: torch.Tensor):
        """
        Run forced alignment of target tokens against an emission.

        Args:
            emission: Emission passed as ``log_probs`` to ``F.forced_align``.
            tokens: Target tokens passed as ``targets``.

        Returns:
            Tuple ``(alignments, scores)`` for the first (only) batch entry;
            ``exp()`` is applied to the scores before they are returned.
        """
        alignments, scores = F.forced_align(
            log_probs=emission,
            targets=tokens,
            blank=0
        )
        alignments, scores = alignments[0], scores[0]
        scores = scores.exp()
        return alignments, scores

    def unflatten(self, list_: List, lengths: List[int]) -> List[List]:
        """
        Split a flat list into consecutive sub-lists of the given lengths.

        Args:
            list_: Flat list to split.
            lengths: Length of each sub-list; must sum to ``len(list_)``.

        Returns:
            List of sub-lists, in order.

        Raises:
            AssertionError: If the lengths do not sum to ``len(list_)``.
        """
        assert len(list_) == sum(lengths), (
            f"cannot unflatten {len(list_)} items into lengths summing to {sum(lengths)}"
        )
        chunks = []
        offset = 0
        for length in lengths:
            chunks.append(list_[offset:offset + length])
            offset += length
        return chunks

    def preview_word(self, waveform: torch.Tensor, spans: List, num_frames: int,
                    transcript: List[str], sample_rate: int) -> List[Dict]:
        """
        Convert per-word frame spans into timestamps in seconds.

        Args:
            waveform: Audio tensor whose dimension 1 is the sample axis.
            spans: One sequence of token spans per word; the first span's
                ``start`` and the last span's ``end`` delimit the word.
            num_frames: Number of emission frames covering the waveform.
            transcript: Words, paired positionally with ``spans``.
            sample_rate: Sample rate used to convert samples to seconds.

        Returns:
            One dict per word with ``transcript``, ``start``, ``end``,
            ``pause`` (gap since the previous word's end, or since 0 for the
            first word) and ``duration``; all times are rounded to 3 decimals.
        """
        # Nothing to pair: return early so the ratio below is never computed
        # for an empty alignment.
        if not spans or not transcript:
            return []

        # Samples per emission frame; constant for the whole waveform.
        ratio = waveform.size(1) / num_frames

        alignment_result = []
        previous_end = 0
        for span, trans in zip(spans, transcript):
            start = round(int(ratio * span[0].start) / sample_rate, 3)
            end = round(int(ratio * span[-1].end) / sample_rate, 3)
            alignment_result.append({
                "transcript": trans,
                "start": start,
                "end": end,
                "pause": round(start - previous_end, 3),
                "duration": round(end - start, 3),
            })
            previous_end = end

        return alignment_result

    def make_wav_batch(self, wav_list: List[torch.Tensor]):
        """
        Zero-pad waveforms to a common length and stack them into one batch.

        Args:
            wav_list: Non-empty list of waveforms; each is indexed along
                dimension 0 as its sample axis.

        Returns:
            Tuple ``(batch, lengths)``: ``batch`` has one row per waveform,
            padded with zeros on the right, and lives on the device of the
            first waveform; ``lengths`` holds the original lengths (int64) on
            the same device.

        Raises:
            ValueError: If ``wav_list`` is empty.
        """
        if not wav_list:
            raise ValueError("wav_list must contain at least one waveform")

        lengths = [wav.size(0) for wav in wav_list]
        batch = torch.zeros(len(wav_list), max(lengths), device=wav_list[0].device)

        for row, (wav, length) in enumerate(zip(wav_list, lengths)):
            batch[row, :length] = wav

        wav_lengths = torch.tensor(lengths, dtype=torch.long)
        return batch, wav_lengths.to(batch.device)

    def get_target(self, transcript: str) -> torch.Tensor:
        """
        Convert an English transcript into character-level target tokens.

        Punctuation is replaced by spaces and the text is lowercased; each
        character of each word is then looked up in the dictionary. Characters
        missing from the dictionary (and ``'-'``) map to the ``'*'`` token.

        Args:
            transcript: English transcript text.

        Returns:
            int32 tensor of shape ``(1, num_characters)`` on ``self.device``.
        """
        star_token = self.DICTIONARY['*']

        tokenized_transcript = [
            self.DICTIONARY[c] if c in self.DICTIONARY and c != '-' else star_token
            for word in self._normalize_words(transcript)
            for c in word
        ]

        return torch.tensor([tokenized_transcript], dtype=torch.int32, device=self.device)

    def get_alignment_result(self, emission_padded: torch.Tensor, emission_length: int,
                           aligned_tokens: torch.Tensor, alignment_scores: torch.Tensor,
                           transcript: str, waveform: torch.Tensor) -> List[Dict]:
        """
        Build per-word timing entries from a frame-level alignment.

        Args:
            emission_padded: Padded emission of one sample; its first
                ``emission_length`` rows are the valid frames.
            emission_length: Number of valid frames in ``emission_padded``.
            aligned_tokens: Aligned tokens returned by ``align``.
            alignment_scores: Alignment scores returned by ``align``.
            transcript: Transcript text of the sample.
            waveform: Unpadded waveform of the sample; a leading dimension is
                added before it is passed to ``preview_word``.

        Returns:
            Per-word alignment entries as produced by ``preview_word``; the
            ``transcript`` values are the lowercased, punctuation-free words.
        """
        words = self._normalize_words(transcript)

        token_spans = F.merge_tokens(aligned_tokens, alignment_scores)
        # One token per character, so word lengths regroup the spans by word.
        word_spans = self.unflatten(token_spans, [len(word) for word in words])
        num_frames = emission_padded[:emission_length, :].size(0)

        return self.preview_word(waveform.unsqueeze(0), word_spans, num_frames,
                               words, self.bundle.sample_rate)

    def align_audio_text(self, waveform: torch.Tensor, transcript: str) -> List[Dict]:
        """
        Align a single waveform with its transcript.

        The waveform is moved to ``self.device``. If ``original_sample_rate``
        is set and differs from the bundle's sample rate, the waveform is
        resampled to the bundle's rate first.

        Args:
            waveform: Audio waveform (1D tensor).
            transcript: English transcript text.

        Returns:
            List of per-word entries with timing information.
        """
        waveform = waveform.to(self.device)

        # getattr keeps this working for instances created without __init__.
        source_rate = getattr(self, 'original_sample_rate', None)
        if source_rate is not None and source_rate != self.bundle.sample_rate:
            waveform = F.resample(waveform, source_rate, self.bundle.sample_rate)

        # Run as a batch of one
        return self.batch_alignment([waveform], [transcript])[0]

    def batch_alignment(self, wav_list: List[torch.Tensor], transcript_list: List[str]) -> List[List[Dict]]:
        """
        Align a batch of waveforms with their transcripts.

        Args:
            wav_list: List of waveforms.
            transcript_list: Transcripts, paired positionally with ``wav_list``.

        Returns:
            One list of per-word entries (with punctuation restored) per
            waveform/transcript pair; ``[]`` when ``wav_list`` is empty.
        """
        if not wav_list:
            return []

        wavs_tensors, wavs_lengths_tensor = self.make_wav_batch(wav_list)

        # Forward pass. Both the batch and its lengths are moved to the model
        # device so inputs created on another device do not cause a mismatch.
        with torch.inference_mode():
            emission, emission_lengths = self.align_model(
                wavs_tensors.to(self.device),
                wavs_lengths_tensor.to(self.device)
            )
            # Append an all-zero output class for the '*' token.
            # NOTE(review): assumes '*' has the last index in DICTIONARY — confirm.
            star_dim = torch.zeros(
                (emission.shape[0], emission.size(1), 1),
                dtype=emission.dtype,
                device=emission.device
            )
            emission = torch.cat((emission, star_dim), dim=-1)

        final_results = []
        for emission_padded, emission_length, waveform, transcript in zip(
            emission, emission_lengths, wav_list, transcript_list
        ):
            target = self.get_target(transcript)
            # Align only the valid (unpadded) frames of this sample.
            aligned_tokens, alignment_scores = self.align(
                emission_padded[:emission_length, :].unsqueeze(0), target
            )
            alignment_result = self.get_alignment_result(
                emission_padded, emission_length, aligned_tokens,
                alignment_scores, transcript, waveform
            )
            # Put the original casing and punctuation back on each word.
            final_results.append(
                get_aligned_result_with_punctuation(alignment_result, transcript)
            )

        return final_results

def align_english_audio_text(audio_path: str, transcript: str, device: str = "cuda", 
                           model_dir: Optional[str] = None) -> List[Dict]:
    """
    Convenience wrapper: align an English audio file with its transcript.

    Loads the audio, averages multi-channel audio down to one channel, builds
    a fresh ``EnglishAlignmentModel`` and records the file's sample rate on it
    before delegating to ``align_audio_text``.

    Args:
        audio_path: Path to the audio file.
        transcript: English transcript text.
        device: Device type ("cuda" or "cpu").
        model_dir: Model directory path.

    Returns:
        List of per-word alignment entries with timing information.

    Example:
        >>> result = align_english_audio_text("audio.wav", "Hello world!")
        >>> print(result)
        [
            {"transcript": "Hello", "start": 0.0, "end": 0.5, "duration": 0.5, "pause": 0.0},
            {"transcript": "world!", "start": 0.6, "end": 1.2, "duration": 0.6, "pause": 0.1}
        ]
    """
    audio, source_rate = torchaudio.load(audio_path)

    # Down-mix: average across dimension 0 only when it holds more than one entry.
    multi_channel = audio.size(0) > 1
    mono = audio.mean(dim=0, keepdim=True) if multi_channel else audio
    # Drop the leading dimension (only removed when its size is 1).
    mono = mono.squeeze(0)

    # A new model is constructed on every call.
    aligner = EnglishAlignmentModel(device=device, model_dir=model_dir)
    # align_audio_text reads this attribute to decide whether to resample.
    aligner.original_sample_rate = source_rate

    return aligner.align_audio_text(mono, transcript)

if __name__ == "__main__":
    # Usage example
    audio_file = "/inspire/hdd/project/embodied-multimodality/public/yqzhang/infer_res/from_newckpt_step40000/test_en/gpu4/output_0.wav"
    # Two-speaker dialogue; each turn starts with an [S1]/[S2] speaker tag.
    # Built from adjacent literals (one per turn) so the string stays valid
    # Python instead of spanning physical lines inside a single quoted literal.
    text = (
        "[S1]Hey, did you hear about that company called MoSi AI? "
        "[S2]MoSi AI? Yeah, I think I've heard of them. Aren't they the ones doing AI stuff? What new thing have they come up with now? "
        "[S1]Yeah, that's them! They recently launched this super hot new product called, um, Asteroid. "
        "[S2]Asteroid. That's a pretty cool name. Does it mean like the space rock? "
        "[S1]Yeah, I think that's what it means. Let me tell you, this thing is incredible. They say it's currently the most realistic, human-like conversational TTS model out there. "
        "[S2]Oh, TTS technology? You mean the text-to-speech thing? Aren't there already a lot of those on the market? What makes this one so special? "
        "[S1]Well, it's completely different. They say the voice produced by Asteroid sounds almost exactly like a real person talking. And it's super smooth and natural. Not at all like, you know, that stiff robotic tone. "
        "[S2]I see. Some voice assistants do still have that mechanical feel, especially during multi-turn conversations. So how amazing is this Asteroid exactly? "
        "[S1]I heard they internally call Asteroid China's own version of NotebookLM. "
        "[S2]NotebookLM? Oh, I know that one. Isn't that the personal AI that Google made? The one that helps organize notes and answers all kinds of questions? So Asteroid has similar functions? "
        "[S1]Right. That's probably what they mean. It's not just that the voice sounds incredibly human. The intelligence level is also really high. It can have these really logical, contextual, in-depth conversations with you. It's just like chatting with a real person. "
        "[S2]Wow, that sounds amazing. If they can really achieve that... "
        "[S1]Yeah, it's basically like having a personal assistant that's both articulate and really understands you. "
        "[S2]Hmm. That does sound appealing. "
        "[S1]And some people are saying it's like the, what's it called again in the voice technology circle? Oh right, DeepSeek. "
        "[S2]DeepSeek? Isn't that the company making large language models? "
        "Their models are pretty popular now. That's high praise. So they're saying Asteroid is top-tier technology? "
        "[S1]Yeah, I think that's what they mean. It's like they've reached a whole new level in voice synthesis. Similar to the impact DeepSeek has had in natural language processing. It could be that kind of groundbreaking technology. "
        "[S2]If Asteroid is really that impressive, where could it be used? I feel like there must be huge potential there. "
        "[S1]Absolutely. Just imagine future smart customer service, audiobook reading, and those virtual livestreamers that are so popular now. The quality would improve dramatically. We might even have personal assistants using Asteroid to talk to us directly. How natural would that be? "
        "[S2]Yeah. That does sound exciting. When can we actually try it out? Are there any demos available? "
        "[S1]I haven't looked into that carefully yet. But since they've already announced it, I'm guessing it won't be long. I'm really eager to try it and see just how human-like it is. "
        "[S2]Yeah, yeah. If it can really deliver what they're promising, getting information and interacting with machines will be so much more convenient. The experience will be much better too. "
        "[S1]Exactly, exactly. We're just waiting for MoSi AI to give us this big surprise."
    )

    # Normalize the text: strip every [S1]/[S2] speaker tag.
    # (`re` is already imported at module level.)
    normalized_text = re.sub(r'\[S[12]\]', '', text).strip()

    # Local model directory
    alignment_model_dir = '/inspire/hdd/project/embodied-multimodality/public/yqzhang/auto_evaluation_new/models/mms_fa'

    try:
        alignment_result = align_english_audio_text(audio_file, normalized_text, model_dir=alignment_model_dir)

        print("对齐结果:")
        for item in alignment_result:
            print(f"单词: '{item['transcript']}', 开始: {item['start']}s, 结束: {item['end']}s, 持续: {item['duration']}s")

    except Exception as e:
        # Top-level boundary of the demo script: report the failure message.
        print(f"对齐失败: {e}")