|
|
import torch |
|
|
from transformers import AutoTokenizer |
|
|
import librosa |
|
|
from omni_speech.model import * |
|
|
import os |
|
|
|
|
|
# Fine-tuned MiniCPM-o ASR checkpoint to evaluate.
# NOTE(review): absolute cluster paths below — this script only runs on the
# host where these mounts exist.
model_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/minicpmo_sft_asr_new"

# Single Vietnamese audio clip used as the transcription input.
audio_path = "/data1/speech/speechData/data_Vi/audio_Vi/reading/db_vinAI_recoring_cleaned/wav-cleaned/phu-yen/3518892/194-1.wav"

# Directory holding the processor/remote-code files (*.py, *.json) that get
# symlinked into the checkpoint directory before loading.
processor_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo"
|
|
|
|
|
# Mirror the processor's *.py / *.json files into the checkpoint directory as
# symlinks, so `from_pretrained(..., trust_remote_code=True)` picks up the
# current remote-code and config files rather than stale copies.
for filename in os.listdir(processor_path):
    if not (filename.endswith(".py") or filename.endswith(".json")):
        continue

    source_file = os.path.join(processor_path, filename)
    target_file = os.path.join(model_path, filename)

    # Drop any stale file/symlink first; only "does not exist" is expected
    # and safe to ignore — anything else (e.g. PermissionError) should surface
    # here instead of as a confusing FileExistsError from os.symlink below.
    try:
        os.remove(target_file)
    except FileNotFoundError:
        pass

    os.symlink(source_file, target_file)
    print(f"Created new symlink for: {filename}")
|
|
|
|
|
# Load the ASR checkpoint with only the audio tower initialized — vision and
# TTS heads are skipped since this script just transcribes speech to text.
_load_kwargs = dict(
    trust_remote_code=True,
    attn_implementation='sdpa',
    torch_dtype=torch.bfloat16,
    init_vision=False,
    init_audio=True,
    init_tts=False,
    processor_path=model_path,
)
model = MiniCPMO.from_pretrained(model_path, **_load_kwargs)

# Inference mode, on GPU.
model = model.eval().cuda()

# Tokenizer ships alongside the checkpoint (remote code allowed).
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
|
|
|
|
# Build the chat history: a system turn plus one user turn carrying the raw
# waveform (16 kHz mono, as the audio front-end expects) and the instruction.
mimick_prompt = "Please transcribe this audio into text."
audio_input, _ = librosa.load(audio_path, sr=16000, mono=True)

system_turn = {
    'role': 'system',
    'content': 'You are a helpful language and speech assistant. You are able to understand the speech content that the user provides, and assist the user with a variety of tasks using natural language.',
}
user_turn = {'role': 'user', 'content': [audio_input, mimick_prompt]}
msgs = [system_turn, user_turn]
|
|
|
|
|
# Run generation. NOTE(review): with sampling=False decoding is presumably
# greedy, so `temperature` likely has no effect here — confirm against the
# model's chat() implementation.
_gen_opts = {
    'sampling': False,
    'max_new_tokens': 128,
    'temperature': 0.3,
    'generate_audio': False,
}
res = model.chat(msgs=msgs, tokenizer=tokenizer, **_gen_opts)

print("Prediction: ")
print(res)