|
|
import re |
|
|
import torch |
|
|
import torchaudio.functional as F |
|
|
import torchaudio |
|
|
import uroman as ur |
|
|
import logging |
|
|
from typing import List, Dict, Any, Optional |
|
|
|
|
|
def split_and_merge_punctuation(text: str) -> List[str]:
    """Split English text on whitespace and glue punctuation onto words.

    Every whitespace-separated element is broken into runs of ASCII
    alphanumerics and runs of punctuation (characters that are neither
    word characters nor whitespace). Each alphanumeric run starts a new
    token; punctuation is attached as follows:

    * punctuation that follows a word is appended to that word
      (``"world!"`` -> ``"world!"``, ``"I've"`` -> ``"I'"``, ``"ve"``);
    * punctuation that precedes the first word of an element is prefixed
      to that word (``'"Hello,"'`` -> ``'"Hello,"'``);
    * an element made only of punctuation is appended to the previous
      token, or prefixed to the next word when no token exists yet.

    As a result the number of returned tokens equals the number of
    alphanumeric runs in ``text`` (a punctuation-only text yields a single
    token), so punctuation never produces a token of its own that would
    have no counterpart among the aligned words.

    Args:
        text: Input English text.

    Returns:
        List of word tokens with punctuation merged into them.
    """
    result: List[str] = []
    # Punctuation seen before any word exists; prefixed to the first word.
    carried = ""

    for element in text.split():
        merged_parts: List[str] = []
        leading = carried
        carried = ""

        for part in re.findall(r'[a-zA-Z0-9]+|[^\w\s]+', element):
            # Classify by content rather than by position: parts do not
            # necessarily alternate word/punctuation (e.g. '"Hello,"').
            if part[0].isalnum():
                merged_parts.append(leading + part)
                leading = ""
            elif merged_parts:
                merged_parts[-1] += part
            else:
                leading += part

        if leading:
            # The element contained no word at all (merged_parts is empty).
            if result:
                result[-1] += leading
            else:
                carried = leading

        result.extend(merged_parts)

    if carried:
        # The whole text contained punctuation only.
        result.append(carried)

    return result
|
|
|
|
|
def restore_spaces_in_english_text(tokens: List[str]) -> str:
    """Join tokens into a string, inserting spaces between plain words.

    A single space is inserted before a token only when all of the
    following hold: it is not the first token, it starts with an
    alphanumeric character, and the previous token contains none of the
    characters ``,.!?;:()[]<>'"…``. In every other case the token is
    concatenated directly.

    NOTE(review): a word that follows a punctuated token therefore gets no
    space (``["Hello,", "world"]`` -> ``"Hello,world"``). This mirrors the
    existing behaviour; confirm it is what callers expect.

    Args:
        tokens: List of word tokens. Empty tokens are tolerated and never
            get a space inserted before them.

    Returns:
        The concatenated text.
    """
    punctuation = ',.!?;:()[]<>\'\"…'
    pieces: List[str] = []

    for i, token in enumerate(tokens):
        # token[:1] (not token[0]) so an empty token cannot raise IndexError.
        starts_with_word_char = token[:1].isalnum()
        if (
            i > 0
            and starts_with_word_char
            and not any(mark in tokens[i - 1] for mark in punctuation)
        ):
            pieces.append(" ")
        pieces.append(token)

    return "".join(pieces)
|
|
|
|
|
def get_aligned_result_with_punctuation(alignment_result: List[Dict], text: str) -> List[Dict]:
    """Replace aligned words with the punctuated tokens of the original text.

    ``text`` is tokenised with ``split_and_merge_punctuation`` and the
    tokens are paired positionally with the entries of
    ``alignment_result``. Each output entry carries the entry's ``start``
    and ``end``, the paired text token as ``transcript``, and every other
    key of the original entry unchanged.

    When the two sequences differ in length the output is truncated to the
    shorter one; a warning is logged because the positional pairing is
    then likely to be off.

    Args:
        alignment_result: Raw alignment entries; each must provide
            ``"start"`` and ``"end"``.
        text: Original text, including punctuation.

    Returns:
        New list of alignment entries whose ``transcript`` contains the
        punctuated token.
    """
    text_tokens = split_and_merge_punctuation(text)

    if len(text_tokens) != len(alignment_result):
        logging.getLogger(__name__).warning(
            "Alignment/text token count mismatch: %d aligned words vs %d text "
            "tokens; result is truncated to the shorter sequence.",
            len(alignment_result),
            len(text_tokens),
        )

    reserved_keys = ("start", "end", "transcript")
    updated_alignment_result = []

    for align_item, text_token in zip(alignment_result, text_tokens):
        updated_item = {
            "start": align_item["start"],
            "end": align_item["end"],
            "transcript": text_token,
        }
        # Carry over any extra fields (e.g. pause, duration) untouched.
        updated_item.update(
            {key: value for key, value in align_item.items() if key not in reserved_keys}
        )
        updated_alignment_result.append(updated_item)

    return updated_alignment_result
|
|
|
|
|
class EnglishAlignmentModel:
    """Word-level forced aligner for English built on torchaudio's MMS_FA bundle."""

    def __init__(self, device: str = "cuda", model_dir: Optional[str] = None):
        """Load the alignment model and its token dictionary.

        Args:
            device: Device specifier passed to ``torch.device``
                (e.g. ``"cuda"`` or ``"cpu"``).
            model_dir: Directory forwarded to the bundle's download
                arguments as ``model_dir``; when falsy the bundle's
                default location is used.
        """
        self.device = torch.device(device)
        self.bundle = torchaudio.pipelines.MMS_FA

        dl_kwargs = {}
        if model_dir:
            dl_kwargs['model_dir'] = model_dir

        # The model is loaded without the star output; batch_alignment
        # appends that column to the emission itself.
        self.align_model = self.bundle.get_model(
            with_star=False,
            dl_kwargs=dl_kwargs
        ).to(self.device)

        self.uroman = ur.Uroman()
        self.DICTIONARY = self.bundle.get_dict()

        # Sample rate of the audio handed to align_audio_text. Callers may
        # set it after construction; None means "already at the bundle's
        # sample rate, do not resample".
        self.original_sample_rate: Optional[int] = None

    @staticmethod
    def _normalize_words(transcript: str) -> List[str]:
        """Lower-case the transcript and split it into punctuation-free words.

        Shared by ``get_target`` and ``get_alignment_result`` so the token
        sequence and the per-word lengths are always derived identically.
        """
        return re.sub(r'[^\w\s]', r' ', transcript).lower().split()

    def align(self, emission: torch.Tensor, tokens: torch.Tensor):
        """Run forced alignment for a single utterance.

        Args:
            emission: Model output passed as ``log_probs`` to
                ``forced_align``; callers in this class pass a batch of
                size one.
            tokens: Target token ids passed as ``targets``.

        Returns:
            Tuple ``(alignments, scores)`` for the first batch entry; the
            scores are exponentiated before being returned.
        """
        alignments, scores = F.forced_align(
            log_probs=emission,
            targets=tokens,
            blank=0
        )
        alignments, scores = alignments[0], scores[0]
        scores = scores.exp()
        return alignments, scores

    def unflatten(self, list_: List, lengths: List[int]) -> List[List]:
        """Split a flat list into consecutive sub-lists of the given lengths.

        Args:
            list_: The flat list.
            lengths: Length of each sub-list, in order.

        Returns:
            The sub-lists, in order.

        Raises:
            ValueError: If ``lengths`` does not sum to ``len(list_)``.
        """
        if len(list_) != sum(lengths):
            raise ValueError(
                f"cannot unflatten {len(list_)} items into groups totalling {sum(lengths)}"
            )

        chunks = []
        offset = 0
        for length in lengths:
            chunks.append(list_[offset:offset + length])
            offset += length
        return chunks

    def preview_word(self, waveform: torch.Tensor, spans: List, num_frames: int,
                     transcript: List[str], sample_rate: int) -> List[Dict]:
        """Convert per-word frame spans into timestamps.

        Args:
            waveform: Audio tensor whose second dimension is the number of
                samples.
            spans: One sequence of token spans per word; the first span's
                ``start`` and the last span's ``end`` (in frames) delimit
                the word.
            num_frames: Number of emission frames covering the waveform.
            transcript: The words, paired positionally with ``spans``.
            sample_rate: Sample rate used to convert samples to seconds.

        Returns:
            One dict per word with ``transcript``, ``start``, ``end``,
            ``pause`` (gap since the previous word's end, measured from 0
            for the first word) and ``duration``; all times are seconds
            rounded to three decimals.
        """
        # Samples per emission frame; constant for the whole utterance.
        ratio = waveform.size(1) / num_frames

        previous_end = 0
        alignment_result = []

        for span, trans in zip(spans, transcript):
            x0 = int(ratio * span[0].start)
            x1 = int(ratio * span[-1].end)

            align_info = {
                "transcript": trans,
                "start": round(x0 / sample_rate, 3),
                "end": round(x1 / sample_rate, 3)
            }
            align_info["pause"] = round(align_info["start"] - previous_end, 3)
            align_info["duration"] = round(align_info["end"] - align_info["start"], 3)
            previous_end = align_info["end"]
            alignment_result.append(align_info)

        return alignment_result

    def make_wav_batch(self, wav_list: List[torch.Tensor]):
        """Zero-pad waveforms to a common length and stack them.

        Args:
            wav_list: Non-empty list of waveforms; ``size(0)`` of each is
                taken as its length.

        Returns:
            Tuple ``(batch, lengths)``: the padded batch, created on the
            device of the first waveform, and a ``long`` tensor of the
            original lengths on the same device.
        """
        lengths = [wav.size(0) for wav in wav_list]
        # Plain ints (not 0-dim tensors) for sizes and slice bounds.
        wavs_tensors = torch.zeros(len(wav_list), max(lengths), device=wav_list[0].device)

        for row, (wav, length) in enumerate(zip(wav_list, lengths)):
            wavs_tensors[row, :length] = wav

        wav_lengths = torch.tensor(lengths, dtype=torch.long, device=wavs_tensors.device)
        return wavs_tensors, wav_lengths

    def get_target(self, transcript: str) -> torch.Tensor:
        """Encode a transcript as target token ids for forced alignment.

        Punctuation is replaced by spaces and the text is lower-cased;
        every character of every resulting word becomes one token. A
        character that is missing from the dictionary, or is ``'-'``, is
        mapped to the dictionary's ``'*'`` token.

        Args:
            transcript: English transcript.

        Returns:
            ``int32`` tensor of token ids with a leading batch dimension
            of one, on ``self.device``.
        """
        star_token = self.DICTIONARY['*']

        tokenized_transcript = [
            self.DICTIONARY[c] if c in self.DICTIONARY and c != '-' else star_token
            for word in self._normalize_words(transcript)
            for c in word
        ]

        return torch.tensor([tokenized_transcript], dtype=torch.int32, device=self.device)

    def get_alignment_result(self, emission_padded: torch.Tensor, emission_length: int,
                             aligned_tokens: torch.Tensor, alignment_scores: torch.Tensor,
                             transcript: str, waveform: torch.Tensor) -> List[Dict]:
        """Turn frame-level alignment output into per-word timestamps.

        Args:
            emission_padded: Padded emission of one utterance.
            emission_length: Number of valid frames in ``emission_padded``.
            aligned_tokens: Frame-level token alignment.
            alignment_scores: Frame-level alignment scores.
            transcript: Transcript that was aligned.
            waveform: The utterance's (unpadded) waveform.

        Returns:
            Per-word alignment entries as produced by ``preview_word``;
            the words are the lower-cased, punctuation-free words of
            ``transcript``.
        """
        words = self._normalize_words(transcript)

        num_frames = emission_padded[:emission_length, :].size(0)
        token_spans = F.merge_tokens(aligned_tokens, alignment_scores)
        # One token per character, so word lengths partition the spans.
        word_spans = self.unflatten(token_spans, [len(word) for word in words])

        return self.preview_word(waveform.unsqueeze(0), word_spans, num_frames,
                                 words, self.bundle.sample_rate)

    def align_audio_text(self, waveform: torch.Tensor, transcript: str) -> List[Dict]:
        """Align a single waveform with its transcript.

        If ``original_sample_rate`` has been set on the instance and
        differs from the bundle's sample rate, the waveform is resampled
        first.

        Args:
            waveform: Audio waveform (1D tensor).
            transcript: English transcript.

        Returns:
            List of per-word alignment entries with timing information.
        """
        waveform = waveform.to(self.device)

        # getattr keeps instances created without __init__ working.
        source_rate = getattr(self, 'original_sample_rate', None)
        if source_rate is not None and source_rate != self.bundle.sample_rate:
            waveform = F.resample(waveform, source_rate, self.bundle.sample_rate)

        return self.batch_alignment([waveform], [transcript])[0]

    def batch_alignment(self, wav_list: List[torch.Tensor], transcript_list: List[str]) -> List[List[Dict]]:
        """Align a batch of waveforms with their transcripts.

        Args:
            wav_list: Waveforms, paired positionally with
                ``transcript_list``.
            transcript_list: Transcripts.

        Returns:
            One list of punctuated per-word alignment entries per
            utterance; an empty list for an empty batch.
        """
        if not wav_list:
            return []

        wavs_tensors, wavs_lengths_tensor = self.make_wav_batch(wav_list)

        with torch.inference_mode():
            # Move the lengths along with the waveforms so both inputs
            # live on the model's device.
            emission, emission_lengths = self.align_model(
                wavs_tensors.to(self.device),
                wavs_lengths_tensor.to(self.device)
            )

            # Append an all-zero column so the '*' token id used by
            # get_target has an emission entry (the model was loaded with
            # with_star=False). Presumably '*' is the last dictionary
            # index -- confirm against the MMS_FA bundle documentation.
            star_dim = torch.zeros(
                (emission.shape[0], emission.size(1), 1),
                dtype=emission.dtype,
                device=emission.device
            )
            emission = torch.cat((emission, star_dim), dim=-1)

            final_results = []
            for emission_padded, emission_length, transcript, waveform in zip(
                    emission, emission_lengths, transcript_list, wav_list):
                target = self.get_target(transcript)
                aligned_tokens, alignment_scores = self.align(
                    emission_padded[:emission_length, :].unsqueeze(0), target
                )
                alignment_result = self.get_alignment_result(
                    emission_padded, emission_length, aligned_tokens,
                    alignment_scores, transcript, waveform
                )
                # Swap the normalised words for the original punctuated tokens.
                final_results.append(
                    get_aligned_result_with_punctuation(alignment_result, transcript)
                )

        return final_results
|
|
|
|
|
def align_english_audio_text(audio_path: str, transcript: str, device: str = "cuda",
                             model_dir: Optional[str] = None) -> List[Dict]:
    """Convenience wrapper: align an English audio file with its transcript.

    The file is loaded with ``torchaudio.load``, mixed down to a single
    channel by averaging when it has more than one, and handed to a newly
    constructed ``EnglishAlignmentModel`` together with the file's sample
    rate.

    Args:
        audio_path: Path of the audio file.
        transcript: English transcript.
        device: Device specifier (``"cuda"`` or ``"cpu"``).
        model_dir: Model directory forwarded to ``EnglishAlignmentModel``.

    Returns:
        List of per-word alignment entries with timing information.

    Example:
        >>> result = align_english_audio_text("audio.wav", "Hello world!")
        >>> print(result)
        [
            {"transcript": "Hello", "start": 0.0, "end": 0.5, "duration": 0.5, "pause": 0.0},
            {"transcript": "world!", "start": 0.6, "end": 1.2, "duration": 0.6, "pause": 0.1}
        ]
    """
    audio, file_sample_rate = torchaudio.load(audio_path)

    # Average the channels of multi-channel audio, then drop the channel axis.
    has_several_channels = audio.size(0) > 1
    if has_several_channels:
        audio = torch.mean(audio, dim=0, keepdim=True)
    mono_audio = audio.squeeze(0)

    aligner = EnglishAlignmentModel(device=device, model_dir=model_dir)
    # Tells align_audio_text which rate to resample from.
    aligner.original_sample_rate = file_sample_rate

    return aligner.align_audio_text(mono_audio, transcript)
|
|
|
|
|
if __name__ == "__main__":
    # Demo run: align a sample two-speaker dialogue with its audio file.
    audio_file = "/inspire/hdd/project/embodied-multimodality/public/yqzhang/infer_res/from_newckpt_step40000/test_en/gpu4/output_0.wav"
    text = "[S1]Hey, did you hear about that company called MoSi AI? [S2]MoSi AI? Yeah, I think I've heard of them. Aren't they the ones doing AI stuff? What new thing have they come up with now? [S1]Yeah, that's them! They recently launched this super hot new product called, um, Asteroid. [S2]Asteroid. That's a pretty cool name. Does it mean like the space rock? [S1]Yeah, I think that's what it means. Let me tell you, this thing is incredible. They say it's currently the most realistic, human-like conversational TTS model out there. [S2]Oh, TTS technology? You mean the text-to-speech thing? Aren't there already a lot of those on the market? What makes this one so special? [S1]Well, it's completely different. They say the voice produced by Asteroid sounds almost exactly like a real person talking. And it's super smooth and natural. Not at all like, you know, that stiff robotic tone. [S2]I see. Some voice assistants do still have that mechanical feel, especially during multi-turn conversations. So how amazing is this Asteroid exactly? [S1]I heard they internally call Asteroid China's own version of NotebookLM. [S2]NotebookLM? Oh, I know that one. Isn't that the personal AI that Google made? The one that helps organize notes and answers all kinds of questions? So Asteroid has similar functions? [S1]Right. That's probably what they mean. It's not just that the voice sounds incredibly human. The intelligence level is also really high. It can have these really logical, contextual, in-depth conversations with you. It's just like chatting with a real person. [S2]Wow, that sounds amazing. If they can really achieve that... [S1]Yeah, it's basically like having a personal assistant that's both articulate and really understands you. [S2]Hmm. That does sound appealing. [S1]And some people are saying it's like the, what's it called again in the voice technology circle? Oh right, DeepSeek. [S2]DeepSeek? Isn't that the company making large language models? Their models are pretty popular now. That's high praise. So they're saying Asteroid is top-tier technology? [S1]Yeah, I think that's what they mean. It's like they've reached a whole new level in voice synthesis. Similar to the impact DeepSeek has had in natural language processing. It could be that kind of groundbreaking technology. [S2]If Asteroid is really that impressive, where could it be used? I feel like there must be huge potential there. [S1]Absolutely. Just imagine future smart customer service, audiobook reading, and those virtual livestreamers that are so popular now. The quality would improve dramatically. We might even have personal assistants using Asteroid to talk to us directly. How natural would that be? [S2]Yeah. That does sound exciting. When can we actually try it out? Are there any demos available? [S1]I haven't looked into that carefully yet. But since they've already announced it, I'm guessing it won't be long. I'm really eager to try it and see just how human-like it is. [S2]Yeah, yeah. If it can really deliver what they're promising, getting information and interacting with machines will be so much more convenient. The experience will be much better too. [S1]Exactly, exactly. We're just waiting for MoSi AI to give us this big surprise."

    # Remove the [S1]/[S2] speaker tags; they are not part of the spoken text.
    # (`re` is already imported at module level.)
    normalized_text = re.sub(r'\[S[12]\]', '', text).strip()

    alignment_model_dir = '/inspire/hdd/project/embodied-multimodality/public/yqzhang/auto_evaluation_new/models/mms_fa'

    # Top-level boundary of the demo: report any failure instead of a traceback.
    try:
        alignment_result = align_english_audio_text(audio_file, normalized_text, model_dir=alignment_model_dir)

        print("对齐结果:")
        for item in alignment_result:
            print(f"单词: '{item['transcript']}', 开始: {item['start']}s, 结束: {item['end']}s, 持续: {item['duration']}s")

    except Exception as e:
        print(f"对齐失败: {e}")