import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import librosa
# Load the full omni model; init_vision/init_audio/init_tts all default to True.
# To load a vision-only model, set init_audio=False and init_tts=False.
# To load an audio-only model, set init_vision=False.
model_path = '/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6'
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    attn_implementation='sdpa',  # 'sdpa' or 'flash_attention_2'
    torch_dtype=torch.bfloat16,
    init_vision=True,
    init_audio=True,
    init_tts=True
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Unless the model was loaded in vision-only mode, the TTS processor and the
# vocos vocoder also need to be initialized:
model.init_tts()
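
# A minimal sketch (kept commented out) of the vision-only load described in the
# comments above. Assumes the same checkpoint; the audio and TTS towers are
# disabled to save memory, and the model.init_tts() call is then unnecessary.
# model = AutoModel.from_pretrained(
#     model_path,
#     trust_remote_code=True,
#     attn_implementation='sdpa',
#     torch_dtype=torch.bfloat16,
#     init_vision=True,
#     init_audio=False,
#     init_tts=False,
# )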
mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
# Load the reference audio as 16 kHz mono, matching the sample rate used here.
audio_input, _ = librosa.load(
    '/data1/speech/namnv59/datahub/Librispeech/test-clean/1089/134686/1089-134686-0001.wav',
    sr=16000,
    mono=True,
)
msgs = [{'role': 'user', 'content': [mimick_prompt, audio_input]}]
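
# Side note (sketch, kept commented out): the content list mixes plain strings
# with other modalities. With init_vision=True, an image query would look like
# the lines below — 'example.jpg' and the question are hypothetical, and this
# assumes the same chat API accepts PIL images (why Image is imported above).
# image = Image.open('example.jpg').convert('RGB')
# msgs = [{'role': 'user', 'content': [image, 'Describe this image.']}]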
res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    max_new_tokens=128,
    use_tts_template=True,
    temperature=0.3,
    generate_audio=True,
    output_audio_path='output.wav',  # save the TTS result to output_audio_path
)
print(res["text"])
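
# Optional sanity check (a sketch using the librosa import above): reload the
# saved TTS output and report its duration and native sample rate.
out_wav, out_sr = librosa.load('output.wav', sr=None, mono=True)
print(f"TTS output: {len(out_wav) / out_sr:.2f} s at {out_sr} Hz")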