import os

import torch
import librosa
from transformers import AutoTokenizer, AutoProcessor

from omni_speech.model import *  # provides MiniCPMO

model_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6"
audio_path = "/data1/speech/speechData/data_En/audio_En/English-copora/extractAll/spgispeech/train/8beac2a5cb0bd40b198e403650ed8041/68.wav"
processor_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo"

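# Symlink the processor directory's *.py / *.json files into the checkpoint
# directory so from_pretrained(trust_remote_code=True) picks up this repo's
# remote code and configs instead of the checkpoint's originals.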
for filename in os.listdir(processor_path):
    source_file = os.path.join(processor_path, filename)
    
    if filename.endswith(".py") or filename.endswith(".json"):
        target_file = os.path.join(model_path, filename)
        
        try:
            os.remove(target_file)
        except FileNotFoundError:
            pass  # nothing to replace yet (e.g. first run)

        os.symlink(source_file, target_file)
        print(f"Symlinked: {filename}")

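# Load only the audio branch: vision and TTS initialization are disabled,
# weights in bfloat16, attention via SDPA.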
model = MiniCPMO.from_pretrained(
    model_path,
    trust_remote_code=True,
    attn_implementation='sdpa', # sdpa or flash_attention_2
    torch_dtype=torch.bfloat16,
    init_vision=False,
    init_audio=True,
    init_tts=False,
    processor_path=model_path,
)

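# Inference mode on the GPU; tokenizer and processor load from the same
# directory, so they use the remote code symlinked above.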
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

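# Load the waveform as mono 16 kHz, matching the sampling_rate passed to the
# feature extractor below.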
audio_input, _ = librosa.load(audio_path, sr=16000, mono=True)

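# Nested lists: the outer list indexes the batch, the inner list the audio
# segments of each sample (assumption: audio_parts tags which contiguous part
# each segment belongs to, here a single part).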
audios = [[audio_input]]
audio_parts = [[1]]

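# Run the processor's feature extraction; chunk_input=True splits the audio
# into fixed-length chunks (assumption: this enables the chunked/streaming
# encoder path of MiniCPM-o).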
audio_features, audio_feature_lens, _ = processor.audio_feature_extract(
    audios, audio_parts, chunk_input=True, sampling_rate=16000
)

data = {
    "audio_features": audio_features,
    "audio_feature_lens": audio_feature_lens
}

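# Encode the features into embeddings; chunk_length=1 pairs with the chunked
# feature extraction above (assumption: chunk length is in seconds).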
res = model.get_audio_embedding(data, chunk_length=1)

# res is nested per sample and per audio segment: res[sample][segment] is a tensor
audio_embs = res[0][0]

# Detach from the graph, move to CPU as float32, and add a batch dimension
audio_embs = audio_embs.detach().to('cpu', dtype=torch.float32)
audio_embeds = audio_embs.unsqueeze(0)
print(f'audio_embeds: {audio_embeds}')
print(f'audio_embeds.shape: {audio_embeds.shape}')