# streaming-speech: omni_speech/infer/infer_minicpmo.py
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import librosa
# Load the full omni model; init_vision/init_audio/init_tts all default to True.
# To load the vision-only model, set init_audio=False and init_tts=False.
# To load the audio-only model, set init_vision=False.
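# For example, an audio-only load (a sketch; same call as below, only the init flags differ) would pass:
#   init_vision=False, init_audio=True, init_tts=True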
model_path = '/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6'

model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    attn_implementation='sdpa',  # sdpa or flash_attention_2
    torch_dtype=torch.bfloat16,
    init_vision=True,
    init_audio=True,
    init_tts=True,
)
model = model.eval().cuda()

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Unless running in vision-only mode, the TTS processor and the Vocos vocoder also need to be initialized.
model.init_tts()
mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
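# Load the reference speech as a 16 kHz mono waveform, matching the sr/mono arguments passed to librosa below.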
audio_input, _ = librosa.load('/data1/speech/namnv59/datahub/Librispeech/test-clean/1089/134686/1089-134686-0001.wav', sr=16000, mono=True)
msgs = [{'role': 'user', 'content': [mimick_prompt, audio_input]}]
res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    max_new_tokens=128,
    use_tts_template=True,
    temperature=0.3,
    generate_audio=True,
    output_audio_path='output.wav',  # save the TTS result to output_audio_path
)
print(res["text"])
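
# Optional sanity check (a minimal sketch, not part of the original script): reload the saved
# TTS output to confirm it was written; the sample rate is read from the file rather than assumed.
check_wav, check_sr = librosa.load('output.wav', sr=None, mono=True)
print(f'Generated audio: {len(check_wav) / check_sr:.2f}s at {check_sr} Hz')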