import os

import torch
import librosa
from transformers import AutoTokenizer

from omni_speech.model import MiniCPMO

# Paths to the fine-tuned ASR checkpoint, the test audio, and the directory
# holding the custom MiniCPM-o processor/model code.
model_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/minicpmo_sft_asr_new"
# audio_path = "/data1/speech/speechData/data_Vi/audio_Vi/reading/db_vinAI_recoring_cleaned/wav-cleaned/tay-ninh/3624328/127-1.wav"
audio_path = "/data1/speech/speechData/data_Vi/audio_Vi/reading/db_vinAI_recoring_cleaned/wav-cleaned/phu-yen/3518892/194-1.wav"
processor_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo"

# Symlink the custom .py/.json files into the checkpoint directory so that
# from_pretrained(..., trust_remote_code=True) can find them.
for filename in os.listdir(processor_path):
    if filename.endswith((".py", ".json")):
        source_file = os.path.join(processor_path, filename)
        target_file = os.path.join(model_path, filename)
        # Remove any stale copy or symlink before re-linking.
        try:
            os.remove(target_file)
        except FileNotFoundError:
            pass
        os.symlink(source_file, target_file)
        print(f"Created new symlink for: {filename}")

# Load the model in bfloat16 with only the audio tower initialized;
# vision and TTS are disabled since this script only does ASR.
model = MiniCPMO.from_pretrained(
    model_path,
    trust_remote_code=True,
    attn_implementation="sdpa",  # "sdpa" or "flash_attention_2"
    torch_dtype=torch.bfloat16,
    init_vision=False,
    init_audio=True,
    init_tts=False,
    processor_path=model_path,
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Load the audio as 16 kHz mono, the sample rate the audio encoder expects.
transcribe_prompt = "Please transcribe this audio into text."
audio_input, _ = librosa.load(audio_path, sr=16000, mono=True)

msgs = [
    {
        "role": "system",
        "content": (
            "You are a helpful language and speech assistant. You are able to "
            "understand the speech content that the user provides, and assist "
            "the user with a variety of tasks using natural language."
        ),
    },
    {"role": "user", "content": [audio_input, transcribe_prompt]},
]

# Greedy decoding (sampling=False), so temperature has no effect here;
# it is kept only to make switching to sampled decoding easy.
res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=False,
    max_new_tokens=128,
    temperature=0.3,
    generate_audio=False,
)
print("Prediction:")
print(res)