# streaming-speech/omni_speech/infer/infer_minicpmo_fc.py
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer, AutoProcessor
import librosa
from omni_speech.model import *
import json
from langchain_community.tools import DuckDuckGoSearchRun, DuckDuckGoSearchResults
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
import re
from duckduckgo_search import DDGS
# By default the full omni model is loaded: init_vision/init_audio/init_tts all default to True.
# To load a vision-only model, set init_audio=False and init_tts=False.
# To load an audio-only model, set init_vision=False.
ddgs_instance = DDGS() # Initialize once
def search(query):
    """Run a DuckDuckGo text search and return the top results as a JSON string."""
    # search = DuckDuckGoSearchResults(output_format="list")
    # search = DuckDuckGoSearchRun()
    query = query.lower()
    print("Query: ")
    print(query)
    # res = search.invoke(query)
    res = ddgs_instance.text(query, max_results=2)
    output = []
    for result in res:
        output.append({
            'title': result['title'],
            # 'link': result['href'],
            'snippet': result['body']
        })
    return json.dumps(output, ensure_ascii=False)
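
# Each DDGS text result is a dict with 'title', 'href' and 'body' keys (only 'title' and 'body'
# are kept here), so the returned value is a JSON string shaped roughly like
# '[{"title": "...", "snippet": "..."}, ...]' (illustrative shape, not real output).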
def run_multiturn_conversation(messages, tokenizer, functions, available_functions):
    # Step 1: send the conversation and the available functions to the model
    # Note: relies on the global `model` loaded below.
    response = model.chat(
        msgs=messages,
        tokenizer=tokenizer,
        sampling=False,
        max_new_tokens=128,
        temperature=0.3,
        generate_audio=False,
        fc=functions
        # chunk_input=False,
        # processor=processor
    )
    # Step 2: check whether the model wants to call a function
    # ("<tool_call>" also matches "<tool_calls>" as a substring)
    if ("<tool_call>" in response) or (response[0] == "{") or ("<response>" in response):
        response_message = response
        print("Recommended Function call:")
        print(response_message)
        if response[0] == "{":
            function = json.loads(response)
        else:
            # strip <tool_call>/<tool_calls> and <response> tags before parsing the JSON payload
            response = re.sub(r"</?tool_calls?>", "", response).strip()
            response = re.sub(r"</?response>", "", response).strip()
            function = json.loads(response)
        print("Function: ")
        print(function)
        # Step 3: call the function
        # Note: the JSON response may not always be valid; be sure to handle errors
        # (a hedged parsing sketch follows this function)
        function_name = function["name"]
        # verify the function exists
        if function_name not in available_functions:
            return "Function " + function_name + " does not exist"
        function_to_call = available_functions[function_name]
        function_args = function["arguments"]
        function_response = function_to_call(**function_args)
        print("Output of function call:")
        print(function_response)
        # Step 4: send the info on the function call and the function response back to the model
        # add the assistant's tool-call message to the conversation
        messages.append(
            {
                "role": "assistant",
                "function_call": [{
                    "type": "function",
                    "function": {"name": function_name, "arguments": function_args},
                }],
                "content": ""
            }
        )
        # add the function response to the conversation
        messages.append(
            {
                "role": "tool",
                "name": function_name,
                "content": function_response,
            }
        )  # extend the conversation with the function response
        print("Messages in next request:")
        for message in messages:
            print(message)
        response = model.chat(
            msgs=messages,
            tokenizer=tokenizer,
            sampling=False,
            max_new_tokens=128,
            temperature=0.3,
            generate_audio=False,
            fc=functions
            # chunk_input=False,
            # processor=processor
        )
    return response
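
# The Step 3 note above warns that the model's tool-call JSON may not always be valid.
# Below is a minimal, hedged sketch of defensive parsing; parse_tool_call is a hypothetical
# helper (not wired into the script) that mirrors the tag handling used above.
def parse_tool_call(raw_response):
    # remove <tool_call>/<tool_calls> and <response> wrappers, then try to parse the JSON payload
    cleaned = re.sub(r"</?tool_calls?>", "", raw_response)
    cleaned = re.sub(r"</?response>", "", cleaned).strip()
    try:
        call = json.loads(cleaned)
    except json.JSONDecodeError:
        return None  # caller can fall back to treating the output as plain text
    if not isinstance(call, dict) or "name" not in call or "arguments" not in call:
        return None
    return call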
model_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/minicpmo_sft_vi_fc"
audio_path = "/data1/speech/speechData/data_Vi/audio_Vi/reading/db_vinAI_recoring_cleaned/wav-cleaned/tay-ninh/3624328/127-1.wav"
# minipcmo_config = json.load(open("/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/scripts/minicpmp_config.json", "r"))
model = MiniCPMO.from_pretrained(
    model_path,
    trust_remote_code=True,
    attn_implementation='sdpa',  # sdpa or flash_attention_2
    torch_dtype=torch.bfloat16,
    init_vision=False,
    init_audio=True,
    init_tts=False,
    processor_path="/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6",
    # pretrained_encoder_path="/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/speech_encoder/whisper-medium",
    # chunk_input=False,
    # **minipcmo_config
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# processor = AutoProcessor.from_pretrained("/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6", trust_remote_code=True)
mimick_prompt = "Please transcribe this audio into text."
audio_input, _ = librosa.load(audio_path, sr=16000, mono=True)
system_message = """You are an assistant designed to help people answer questions.
You have access to query the web using Bing Search. You should call bing search whenever a question requires up to date information or could benefit from web data.
"""
# system_message = "You are a helpful language and speech assistant. You are able to understand the speech content that the user provides, and assist the user with a variety of tasks using natural language."
tools = [
    {
        "type": "function",
        "function": {
            "name": "search_bing",
            # "Search Bing for up-to-date information from the web"
            "description": "Tìm kiếm trên Bing để lấy thông tin cập nhật từ web",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        # "The search query"
                        "description": "Truy vấn tìm kiếm",
                    }
                },
                "required": ["query"],
            },
        }
    }
]
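# For reference, run_multiturn_conversation() expects the model either to emit bare JSON or to
# wrap it in tags, roughly like this (illustrative, inferred from the parsing logic above):
# <tool_call>{"name": "search_bing", "arguments": {"query": "..."}}</tool_call>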
msgs = [{'role': 'system', 'content': system_message}]
# msgs.append({'role': 'user', 'content': [audio_input, mimick_prompt]})
# msgs.append({'role': 'user', 'content': [audio_input]})
msgs.append({'role': 'user', 'content': ["Chủ tịch nước hiện tại của Việt Nam là ai?"]})
available_functions = {'search_bing': search}
result = run_multiturn_conversation(msgs, tokenizer, tools, available_functions)
# result = model.chat(
# msgs=msgs,
# tokenizer=tokenizer,
# sampling=False,
# max_new_tokens=128,
# temperature=0.3,
# generate_audio=False,
# fc=tools
# # chunk_input=False,
# # processor=processor
# )
print("Prediction: ")
print(result)