# streaming-speech/omni_speech/infer/infer_minicpmo_audio_embs.py
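"""Extract audio embeddings from a local MiniCPM-o 2.6 checkpoint.

The script symlinks the local processor code into the model directory,
loads the model with only the audio tower initialized, extracts features
from a single 16 kHz WAV file, and prints the resulting embedding tensor
and its shape.
"""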
import os

import librosa
import torch
from transformers import AutoProcessor, AutoTokenizer

from omni_speech.model import *  # provides MiniCPMO
# Hard-coded local paths; adjust to your environment.
model_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6"
audio_path = "/data1/speech/speechData/data_En/audio_En/English-copora/extractAll/spgispeech/train/8beac2a5cb0bd40b198e403650ed8041/68.wav"
processor_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo"
# Symlink the local processor's .py/.json files into the model directory so
# that from_pretrained(..., trust_remote_code=True) picks them up.
for filename in os.listdir(processor_path):
    if filename.endswith((".py", ".json")):
        source_file = os.path.join(processor_path, filename)
        target_file = os.path.join(model_path, filename)
        try:
            os.remove(target_file)  # drop any stale file or symlink
        except FileNotFoundError:
            pass  # nothing to remove on first run
        os.symlink(source_file, target_file)
        print(f"Created new symlink for: {filename}")
model = MiniCPMO.from_pretrained(
    model_path,
    trust_remote_code=True,
    attn_implementation="sdpa",  # "sdpa" or "flash_attention_2"
    torch_dtype=torch.bfloat16,
    init_vision=False,
    init_audio=True,
    init_tts=False,
    processor_path=model_path,
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)  # loaded but not used below
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
# Load the clip as 16 kHz mono; the processor expects nested lists
# (one inner list of audio segments per sample in the batch).
audio_input, _ = librosa.load(audio_path, sr=16000, mono=True)
audios = [[audio_input]]
audio_parts = [[1]]
audio_features, audio_feature_lens, _ = processor.audio_feature_extract(
    audios, audio_parts, chunk_input=True, sampling_rate=16000
)
data = {
    "audio_features": audio_features,
    "audio_feature_lens": audio_feature_lens,
}
res = model.get_audio_embedding(data, chunk_length=1)
# get_audio_embedding returns a nested list; take the tensor for the first
# (and only) sample's first segment.
audio_embs = res[0][0]
# Detach from the graph and move to CPU as float32 for downstream use.
audio_embs = audio_embs.detach().to("cpu", dtype=torch.float32)
audio_embeds = audio_embs.unsqueeze(0)  # add a batch dimension -> (1, T, D)
print(f"audio_embeds: {audio_embeds}")
print(audio_embeds.shape)
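# Optional: persist the embeddings for later use (output path is illustrative).
# torch.save(audio_embeds, "audio_embeds.pt")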