# File: streaming-speech/omni_speech/infer/infer_minicpmo_asr.py
# Uploaded by NMCxyz ("Add files using upload-large-folder tool"), commit 20e4eaa (verified).
import torch
from transformers import AutoTokenizer
import librosa
from omni_speech.model import *
import os
# Checkpoint directory handed to from_pretrained() for both model and tokenizer.
model_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/minicpmo_sft_asr_new"
# Audio file to transcribe; the commented-out path is an alternative sample.
# audio_path = "/data1/speech/speechData/data_Vi/audio_Vi/reading/db_vinAI_recoring_cleaned/wav-cleaned/tay-ninh/3624328/127-1.wav"
audio_path = "/data1/speech/speechData/data_Vi/audio_Vi/reading/db_vinAI_recoring_cleaned/wav-cleaned/phu-yen/3518892/194-1.wav"
# Directory whose .py / .json files are linked into the checkpoint directory.
processor_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo"

# Refresh the links in model_path so every .py / .json file found in
# processor_path is reachable from the checkpoint directory.
# NOTE(review): presumably needed so trust_remote_code loading from model_path
# finds the modelling code and configs -- confirm.
for filename in os.listdir(processor_path):
    # endswith() accepts a tuple, so one call covers both extensions.
    if not filename.endswith((".py", ".json")):
        continue
    source_file = os.path.join(processor_path, filename)
    target_file = os.path.join(model_path, filename)
    try:
        # Drop any stale file/link first; os.symlink refuses to overwrite.
        os.remove(target_file)
    except FileNotFoundError:
        # Only a missing target is expected here. Other failures (permissions,
        # target is a directory, ...) now propagate instead of being hidden
        # behind a misleading message and a later FileExistsError.
        print(f"Created new symlink for: {filename}")
    os.symlink(source_file, target_file)
# Options forwarded to MiniCPMO.from_pretrained: bfloat16 weights, SDPA
# attention, and only the audio init flag enabled (vision and TTS are off).
load_kwargs = dict(
    trust_remote_code=True,
    attn_implementation='sdpa',  # sdpa or flash_attention_2
    torch_dtype=torch.bfloat16,
    init_vision=False,
    init_audio=True,
    init_tts=False,
    # The processor is read from the checkpoint directory itself.
    processor_path=model_path,
)
model = MiniCPMO.from_pretrained(model_path, **load_kwargs)
# Switch to evaluation mode, then move the model to the CUDA device.
model = model.eval()
model = model.cuda()

# Tokenizer comes from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Text instruction that accompanies the audio in the user turn.
mimick_prompt = "Please transcribe this audio into text."
# Load the clip as mono at 16 kHz; the sample rate librosa returns is not used.
audio_input = librosa.load(audio_path, sr=16000, mono=True)[0]

# Conversation: one system prompt followed by one user turn whose content is
# the pair [audio, instruction].
msgs = [
    {'role': 'system', 'content': 'You are a helpful language and speech assistant. You are able to understand the speech content that the user provides, and assist the user with a variety of tasks using natural language.'},
    {'role': 'user', 'content': [audio_input, mimick_prompt]},
]

# Arguments for model.chat: sampling disabled, at most 128 new tokens, and no
# audio generation (text output only).
chat_kwargs = dict(
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=False,
    max_new_tokens=128,
    temperature=0.3,
    generate_audio=False,
)
res = model.chat(**chat_kwargs)

print("Prediction: ")
print(res)