import os

import torch
import librosa
from transformers import AutoTokenizer

from omni_speech.model import MiniCPMO

# Paths to the fine-tuned ASR checkpoint, the test audio, and the directory
# holding the custom MiniCPM-o processor/model code.
model_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/minicpmo_sft_asr_new"
# audio_path = "/data1/speech/speechData/data_Vi/audio_Vi/reading/db_vinAI_recoring_cleaned/wav-cleaned/tay-ninh/3624328/127-1.wav"
audio_path = "/data1/speech/speechData/data_Vi/audio_Vi/reading/db_vinAI_recoring_cleaned/wav-cleaned/phu-yen/3518892/194-1.wav"
processor_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo"

# Symlink the custom .py/.json files into the checkpoint directory so that
# from_pretrained(..., trust_remote_code=True) can find them.
for filename in os.listdir(processor_path):
    if filename.endswith((".py", ".json")):
        source_file = os.path.join(processor_path, filename)
        target_file = os.path.join(model_path, filename)
        # Remove any stale copy or symlink before re-linking.
        try:
            os.remove(target_file)
        except FileNotFoundError:
            pass
        os.symlink(source_file, target_file)
        print(f"Created new symlink for: {filename}")

# Load the model in bfloat16 with only the audio tower initialized;
# vision and TTS are disabled since this script only does ASR.
model = MiniCPMO.from_pretrained(
    model_path,
    trust_remote_code=True,
    attn_implementation="sdpa",  # "sdpa" or "flash_attention_2"
    torch_dtype=torch.bfloat16,
    init_vision=False,
    init_audio=True,
    init_tts=False,
    processor_path=model_path,
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Load the audio as 16 kHz mono, the sample rate the audio encoder expects.
transcribe_prompt = "Please transcribe this audio into text."
audio_input, _ = librosa.load(audio_path, sr=16000, mono=True)

msgs = [
    {
        "role": "system",
        "content": (
            "You are a helpful language and speech assistant. You are able to "
            "understand the speech content that the user provides, and assist "
            "the user with a variety of tasks using natural language."
        ),
    },
    {"role": "user", "content": [audio_input, transcribe_prompt]},
]

# Greedy decoding (sampling=False), so temperature has no effect here;
# it is kept only to make switching to sampled decoding easy.
res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=False,
    max_new_tokens=128,
    temperature=0.3,
    generate_audio=False,
)
print("Prediction:")
print(res)