# streaming-speech/omni_speech/infer/infer_minicpmo_audio_embs.py
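"""Extract audio embeddings from a local MiniCPM-o 2.6 checkpoint.

The script symlinks the local processor code into the model directory,
loads the model with only the audio tower initialized, extracts features
from a single 16 kHz WAV file, and prints the resulting embedding tensor
and its shape.
"""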
import os

import librosa
import torch
from transformers import AutoProcessor, AutoTokenizer

from omni_speech.model import *  # provides MiniCPMO
# Hard-coded local paths; adjust to your environment.
model_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6"
audio_path = "/data1/speech/speechData/data_En/audio_En/English-copora/extractAll/spgispeech/train/8beac2a5cb0bd40b198e403650ed8041/68.wav"
processor_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo"
# Symlink the local processor's .py/.json files into the model directory so
# that from_pretrained(..., trust_remote_code=True) picks them up.
for filename in os.listdir(processor_path):
    if filename.endswith((".py", ".json")):
        source_file = os.path.join(processor_path, filename)
        target_file = os.path.join(model_path, filename)
        try:
            os.remove(target_file)  # drop any stale file or symlink
        except FileNotFoundError:
            pass  # nothing to remove on first run
        os.symlink(source_file, target_file)
        print(f"Created new symlink for: {filename}")
model = MiniCPMO.from_pretrained(
    model_path,
    trust_remote_code=True,
    attn_implementation="sdpa",  # "sdpa" or "flash_attention_2"
    torch_dtype=torch.bfloat16,
    init_vision=False,
    init_audio=True,
    init_tts=False,
    processor_path=model_path,
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)  # loaded but not used below
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
# Load the clip as 16 kHz mono; the processor expects nested lists
# (one inner list of audio segments per sample in the batch).
audio_input, _ = librosa.load(audio_path, sr=16000, mono=True)
audios = [[audio_input]]
audio_parts = [[1]]
audio_features, audio_feature_lens, _ = processor.audio_feature_extract(
    audios, audio_parts, chunk_input=True, sampling_rate=16000
)
data = {
    "audio_features": audio_features,
    "audio_feature_lens": audio_feature_lens,
}
res = model.get_audio_embedding(data, chunk_length=1)
# get_audio_embedding returns a nested list; take the tensor for the first
# (and only) sample's first segment.
audio_embs = res[0][0]
# Detach from the graph and move to CPU as float32 for downstream use.
audio_embs = audio_embs.detach().to("cpu", dtype=torch.float32)
audio_embeds = audio_embs.unsqueeze(0)  # add a batch dimension -> (1, T, D)
print(f"audio_embeds: {audio_embeds}")
print(audio_embeds.shape)
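# Optional: persist the embeddings for later use (output path is illustrative).
# torch.save(audio_embeds, "audio_embeds.pt")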