import os
import time
import json

import torch
import librosa
from transformers import AutoTokenizer, AutoProcessor

from omni_speech.model import *
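
# Local paths: MiniCPM-o-2_6 checkpoint, a sample SPGISpeech utterance, and the
# directory holding the custom processor/config files.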
model_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6" |
|
|
audio_path = "/data1/speech/speechData/data_En/audio_En/English-copora/extractAll/spgispeech/train/8beac2a5cb0bd40b198e403650ed8041/68.wav" |
|
|
processor_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo" |
|
|
|
|
|
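
# Symlink every .py / .json file from processor_path into the checkpoint
# directory, replacing any existing copies.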
for filename in os.listdir(processor_path):
    source_file = os.path.join(processor_path, filename)

    if filename.endswith((".py", ".json")):
        target_file = os.path.join(model_path, filename)

        # Remove a stale file or symlink if present; ignore a missing target.
        try:
            os.remove(target_file)
        except FileNotFoundError:
            pass

        os.symlink(source_file, target_file)
        print(f"Created new symlink for: {filename}")
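
# Load MiniCPM-o with only the audio branch initialised (vision and TTS disabled).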
model = MiniCPMO.from_pretrained(
    model_path,
    trust_remote_code=True,
    attn_implementation='sdpa',
    torch_dtype=torch.bfloat16,
    init_vision=False,
    init_audio=True,
    init_tts=False,
    processor_path=model_path,
)

model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
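
# Load the waveform as mono audio resampled to 16 kHz.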
audio_input, _ = librosa.load(audio_path, sr=16000, mono=True)

# A batch with a single sample containing one audio segment.
audios = [[audio_input]]
audio_parts = [[1]]

# Extract the audio features and their per-sample lengths.
audio_features, audio_feature_lens, _ = processor.audio_feature_extract(
    audios, audio_parts, chunk_input=True, sampling_rate=16000
)

data = {
    "audio_features": audio_features,
    "audio_feature_lens": audio_feature_lens,
}

# Encode the extracted features into audio embeddings.
res = model.get_audio_embedding(data, chunk_length=1)
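
# res is nested per sample and per audio segment; take the embedding tensor of
# the first (and only) audio.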
audio_embs = res[0][0]

# Detach from the graph, move to CPU as float32, and add a batch dimension.
audio_embs = audio_embs.detach().to('cpu', dtype=torch.float32)
audio_embeds = audio_embs.unsqueeze(0)

print(f'audio_embeds: {audio_embeds}')
print(audio_embeds.shape)