# streaming-speech/omni_speech/infer/infer_minicpmo_fc.py
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer, AutoProcessor
import librosa
from omni_speech.model import *
import json
from langchain_community.tools import DuckDuckGoSearchRun, DuckDuckGoSearchResults
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
import re
from duckduckgo_search import DDGS
# By default the full omni model is loaded: init_vision/init_audio/init_tts all default to True.
# To load a vision-only model, set init_audio=False and init_tts=False.
# To load an audio-only model, set init_vision=False.
ddgs_instance = DDGS() # Initialize once
def search(query):
    """Run a DuckDuckGo text search and return the top results as a JSON string."""
    # search = DuckDuckGoSearchResults(output_format="list")
    # search = DuckDuckGoSearchRun()
    query = query.lower()
    print("Query: ")
    print(query)
    # res = search.invoke(query)
    res = ddgs_instance.text(query, max_results=2)
    output = []
    for result in res:
        output.append({
            'title': result['title'],
            # 'link': result['href'],
            'snippet': result['body']
        })
    return json.dumps(output, ensure_ascii=False)
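
# Each DDGS text result is a dict with 'title', 'href' and 'body' keys (only 'title' and 'body'
# are kept here), so the returned value is a JSON string shaped roughly like
# '[{"title": "...", "snippet": "..."}, ...]' (illustrative shape, not real output).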
def run_multiturn_conversation(messages, tokenizer, functions, available_functions):
    # Step 1: send the conversation and the available functions to the model
    # Note: relies on the global `model` loaded below.
    response = model.chat(
        msgs=messages,
        tokenizer=tokenizer,
        sampling=False,
        max_new_tokens=128,
        temperature=0.3,
        generate_audio=False,
        fc=functions
        # chunk_input=False,
        # processor=processor
    )
    # Step 2: check whether the model wants to call a function
    # ("<tool_call>" also matches "<tool_calls>" as a substring)
    if ("<tool_call>" in response) or (response[0] == "{") or ("<response>" in response):
        response_message = response
        print("Recommended Function call:")
        print(response_message)
        if response[0] == "{":
            function = json.loads(response)
        else:
            # strip <tool_call>/<tool_calls> and <response> tags before parsing the JSON payload
            response = re.sub(r"</?tool_calls?>", "", response).strip()
            response = re.sub(r"</?response>", "", response).strip()
            function = json.loads(response)
        print("Function: ")
        print(function)
        # Step 3: call the function
        # Note: the JSON response may not always be valid; be sure to handle errors
        # (a hedged parsing sketch follows this function)
        function_name = function["name"]
        # verify the function exists
        if function_name not in available_functions:
            return "Function " + function_name + " does not exist"
        function_to_call = available_functions[function_name]
        function_args = function["arguments"]
        function_response = function_to_call(**function_args)
        print("Output of function call:")
        print(function_response)
        # Step 4: send the info on the function call and the function response back to the model
        # add the assistant's tool-call message to the conversation
        messages.append(
            {
                "role": "assistant",
                "function_call": [{
                    "type": "function",
                    "function": {"name": function_name, "arguments": function_args},
                }],
                "content": ""
            }
        )
        # add the function response to the conversation
        messages.append(
            {
                "role": "tool",
                "name": function_name,
                "content": function_response,
            }
        )  # extend the conversation with the function response
        print("Messages in next request:")
        for message in messages:
            print(message)
        response = model.chat(
            msgs=messages,
            tokenizer=tokenizer,
            sampling=False,
            max_new_tokens=128,
            temperature=0.3,
            generate_audio=False,
            fc=functions
            # chunk_input=False,
            # processor=processor
        )
    return response
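
# The Step 3 note above warns that the model's tool-call JSON may not always be valid.
# Below is a minimal, hedged sketch of defensive parsing; parse_tool_call is a hypothetical
# helper (not wired into the script) that mirrors the tag handling used above.
def parse_tool_call(raw_response):
    # remove <tool_call>/<tool_calls> and <response> wrappers, then try to parse the JSON payload
    cleaned = re.sub(r"</?tool_calls?>", "", raw_response)
    cleaned = re.sub(r"</?response>", "", cleaned).strip()
    try:
        call = json.loads(cleaned)
    except json.JSONDecodeError:
        return None  # caller can fall back to treating the output as plain text
    if not isinstance(call, dict) or "name" not in call or "arguments" not in call:
        return None
    return call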
model_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/minicpmo_sft_vi_fc"
audio_path = "/data1/speech/speechData/data_Vi/audio_Vi/reading/db_vinAI_recoring_cleaned/wav-cleaned/tay-ninh/3624328/127-1.wav"
# minipcmo_config = json.load(open("/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/scripts/minicpmp_config.json", "r"))
model = MiniCPMO.from_pretrained(
    model_path,
    trust_remote_code=True,
    attn_implementation='sdpa',  # sdpa or flash_attention_2
    torch_dtype=torch.bfloat16,
    init_vision=False,
    init_audio=True,
    init_tts=False,
    processor_path="/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6",
    # pretrained_encoder_path="/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/speech_encoder/whisper-medium",
    # chunk_input=False,
    # **minipcmo_config
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# processor = AutoProcessor.from_pretrained("/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6", trust_remote_code=True)
mimick_prompt = "Please transcribe this audio into text."
audio_input, _ = librosa.load(audio_path, sr=16000, mono=True)
system_message = """You are an assistant designed to help people answer questions.
You have access to query the web using Bing Search. You should call bing search whenever a question requires up to date information or could benefit from web data.
"""
# system_message = "You are a helpful language and speech assistant. You are able to understand the speech content that the user provides, and assist the user with a variety of tasks using natural language."
tools = [
    {
        "type": "function",
        "function": {
            "name": "search_bing",
            # "Search Bing for up-to-date information from the web"
            "description": "Tìm kiếm trên Bing để lấy thông tin cập nhật từ web",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        # "The search query"
                        "description": "Truy vấn tìm kiếm",
                    }
                },
                "required": ["query"],
            },
        }
    }
]
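# For reference, run_multiturn_conversation() expects the model either to emit bare JSON or to
# wrap it in tags, roughly like this (illustrative, inferred from the parsing logic above):
# <tool_call>{"name": "search_bing", "arguments": {"query": "..."}}</tool_call>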
msgs = [{'role': 'system', 'content': system_message}]
# msgs.append({'role': 'user', 'content': [audio_input, mimick_prompt]})
# msgs.append({'role': 'user', 'content': [audio_input]})
msgs.append({'role': 'user', 'content': ["Chủ tịch nước hiện tại của Việt Nam là ai?"]})
available_functions = {'search_bing': search}
result = run_multiturn_conversation(msgs, tokenizer, tools, available_functions)
# result = model.chat(
# msgs=msgs,
# tokenizer=tokenizer,
# sampling=False,
# max_new_tokens=128,
# temperature=0.3,
# generate_audio=False,
# fc=tools
# # chunk_input=False,
# # processor=processor
# )
print("Prediction: ")
print(result)