|
|
import torch |
|
|
from transformers import AutoTokenizer |
|
|
import librosa |
|
|
from omni_speech.model import * |
|
|
import os |
|
|
|
|
|
# Fine-tuned MiniCPM-o ASR checkpoint to evaluate.
# NOTE(review): absolute cluster paths below — this script only runs on the
# host where these mounts exist.
model_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/minicpmo_sft_asr_new"

# Single Vietnamese audio clip used as the transcription input.
audio_path = "/data1/speech/speechData/data_Vi/audio_Vi/reading/db_vinAI_recoring_cleaned/wav-cleaned/phu-yen/3518892/194-1.wav"

# Directory holding the processor/remote-code files (*.py, *.json) that get
# symlinked into the checkpoint directory before loading.
processor_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo"
|
|
|
|
|
# Mirror the processor's *.py / *.json files into the checkpoint directory as
# symlinks, so `from_pretrained(..., trust_remote_code=True)` picks up the
# current remote-code and config files rather than stale copies.
for filename in os.listdir(processor_path):
    if not (filename.endswith(".py") or filename.endswith(".json")):
        continue

    source_file = os.path.join(processor_path, filename)
    target_file = os.path.join(model_path, filename)

    # Drop any stale file/symlink first; only "does not exist" is expected
    # and safe to ignore — anything else (e.g. PermissionError) should surface
    # here instead of as a confusing FileExistsError from os.symlink below.
    try:
        os.remove(target_file)
    except FileNotFoundError:
        pass

    os.symlink(source_file, target_file)
    print(f"Created new symlink for: {filename}")
|
|
|
|
|
# Load the ASR checkpoint with only the audio tower initialized — vision and
# TTS heads are skipped since this script just transcribes speech to text.
_load_kwargs = dict(
    trust_remote_code=True,
    attn_implementation='sdpa',
    torch_dtype=torch.bfloat16,
    init_vision=False,
    init_audio=True,
    init_tts=False,
    processor_path=model_path,
)
model = MiniCPMO.from_pretrained(model_path, **_load_kwargs)

# Inference mode, on GPU.
model = model.eval().cuda()

# Tokenizer ships alongside the checkpoint (remote code allowed).
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
|
|
|
|
# Build the chat history: a system turn plus one user turn carrying the raw
# waveform (16 kHz mono, as the audio front-end expects) and the instruction.
mimick_prompt = "Please transcribe this audio into text."
audio_input, _ = librosa.load(audio_path, sr=16000, mono=True)

system_turn = {
    'role': 'system',
    'content': 'You are a helpful language and speech assistant. You are able to understand the speech content that the user provides, and assist the user with a variety of tasks using natural language.',
}
user_turn = {'role': 'user', 'content': [audio_input, mimick_prompt]}
msgs = [system_turn, user_turn]
|
|
|
|
|
# Run generation. NOTE(review): with sampling=False decoding is presumably
# greedy, so `temperature` likely has no effect here — confirm against the
# model's chat() implementation.
_gen_opts = {
    'sampling': False,
    'max_new_tokens': 128,
    'temperature': 0.3,
    'generate_audio': False,
}
res = model.chat(msgs=msgs, tokenizer=tokenizer, **_gen_opts)

print("Prediction: ")
print(res)