|
|
import torch |
|
|
from PIL import Image |
|
|
from transformers import AutoModel, AutoTokenizer, AutoProcessor |
|
|
import librosa |
|
|
from omni_speech.model import * |
|
|
import json |
|
|
from langchain_community.tools import DuckDuckGoSearchRun, DuckDuckGoSearchResults |
|
|
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper |
|
|
import re |
|
|
from duckduckgo_search import DDGS |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Single DuckDuckGo client created at import time; search() issues its
# text queries through this object.
ddgs_instance = DDGS()
|
|
|
|
|
def search(query, *, max_results=2, client=None):
    """Run a DuckDuckGo text search and return the hits as a JSON string.

    Args:
        query: Search text. It is lower-cased before being sent.
        max_results: Maximum number of hits to request from the client.
            Defaults to 2, the value that used to be hard-coded.
        client: Object exposing ``text(query, max_results=...)`` and returning
            an iterable of mappings with ``'title'`` and ``'body'`` keys.
            When omitted, the module-level ``ddgs_instance`` is used.

    Returns:
        A JSON array (as ``str``, non-ASCII characters kept verbatim) whose
        items have the keys ``"title"`` and ``"snippet"``. An empty or
        ``None`` result from the client yields ``"[]"``.

    Raises:
        KeyError: If a hit lacks ``'title'`` or ``'body'``.
        Any exception raised by the client itself is propagated unchanged.
    """
    query = query.lower()
    print("Query: ")
    print(query)

    if client is None:
        # Resolved at call time so the shared module-level client is reused.
        client = ddgs_instance

    hits = client.text(query, max_results=max_results)

    # Keep only the fields handed back to the model; ``hits or ()`` guards
    # against a client that returns None instead of an empty list.
    output = [
        {'title': hit['title'], 'snippet': hit['body']}
        for hit in (hits or ())
    ]
    return json.dumps(output, ensure_ascii=False)
|
|
|
|
|
# Opening tags that mark a reply as a tool call. Both spellings of the
# tool-call tag are listed so that detection and stripping agree.
_TOOL_CALL_OPENERS = ("<tool_call>", "<tool_calls>", "<response>")

# Matches the opening and closing forms of every wrapper tag above.
_TOOL_CALL_TAGS = re.compile(r"</?(?:tool_calls?|response)>")


def _chat(chat_model, messages, tokenizer, functions):
    """Run one text-only generation step with the fixed decoding settings."""
    return chat_model.chat(
        msgs=messages,
        tokenizer=tokenizer,
        sampling=False,
        max_new_tokens=128,
        temperature=0.3,
        generate_audio=False,
        fc=functions,
    )


def _is_tool_call(response):
    """Tell whether a reply is bare JSON or carries a tool-call wrapper tag."""
    # startswith() is safe on an empty reply, unlike indexing response[0].
    return response.startswith("{") or any(
        tag in response for tag in _TOOL_CALL_OPENERS
    )


def _decode_tool_call(response):
    """Strip wrapper tags (unless the reply is bare JSON) and decode it."""
    if response.startswith("{"):
        return json.loads(response)
    return json.loads(_TOOL_CALL_TAGS.sub("", response).strip())


def run_multiturn_conversation(messages, tokenizer, functions,
                               available_functions, chat_model=None):
    """Ask the model, execute at most one requested tool, and ask again.

    The model is queried once. If its reply is a tool call (a JSON object,
    optionally wrapped in ``<tool_call>``, ``<tool_calls>`` or ``<response>``
    tags), the named function is called, the call and its output are appended
    to ``messages``, and the model is queried a second time.

    Args:
        messages: Conversation history. Mutated in place: an assistant
            ``function_call`` entry and a ``tool`` entry are appended when a
            tool is executed.
        tokenizer: Passed through to ``chat_model.chat``.
        functions: Tool schemas, passed to ``chat_model.chat`` as ``fc``.
        available_functions: Mapping from tool name to a callable that is
            invoked with the decoded arguments as keyword arguments.
        chat_model: Object exposing ``chat(**kwargs)``. Defaults to the
            module-level ``model``.

    Returns:
        The first reply when it is not a tool call; the string
        ``"Function <name> does not exist"`` when the requested tool is not in
        ``available_functions``; otherwise the reply of the second query.

    Raises:
        json.JSONDecodeError: If a detected tool call is not valid JSON.
        KeyError: If the decoded tool call has no ``"name"`` entry.
    """
    if chat_model is None:
        # Looked up at call time; the global is assigned after this def.
        chat_model = model

    response = _chat(chat_model, messages, tokenizer, functions)
    if not _is_tool_call(response):
        return response

    print("Recommended Function call:")
    print(response)

    function = _decode_tool_call(response)
    print("Function: ")
    print(function)

    function_name = function["name"]
    if function_name not in available_functions:
        return "Function " + function_name + " does not exist"

    # Tolerate a missing "arguments" entry and arguments serialised as a JSON
    # string instead of an object.
    function_args = function.get("arguments") or {}
    if isinstance(function_args, str):
        function_args = json.loads(function_args)

    function_response = available_functions[function_name](**function_args)
    print("Output of function call:")
    print(function_response)

    # Record the call and its result so the second query can see both.
    messages.append(
        {
            "role": "assistant",
            "function_call": [{
                "type": "function",
                "function": {"name": function_name, "arguments": function_args},
            }],
            "content": "",
        }
    )
    messages.append(
        {
            "role": "tool",
            "name": function_name,
            "content": function_response,
        }
    )

    print("Messages in next request:")
    for message in messages:
        print(message)

    return _chat(chat_model, messages, tokenizer, functions)
|
|
|
|
|
# Checkpoint directory: both the model and the tokenizer are loaded from it.
model_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/minicpmo_sft_vi_fc"
# Recording that is read with librosa later in this script.
audio_path = "/data1/speech/speechData/data_Vi/audio_Vi/reading/db_vinAI_recoring_cleaned/wav-cleaned/tay-ninh/3624328/127-1.wav"

# Load the checkpoint in bfloat16 with SDPA attention. The init_* flags ask
# for the audio part only (vision and TTS off). The loaded model is switched
# to eval mode and moved to the GPU in the same expression.
model = MiniCPMO.from_pretrained(
    model_path,
    trust_remote_code=True,
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
    init_vision=False,
    init_audio=True,
    init_tts=False,
    processor_path="/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6",
).eval().cuda()

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
|
|
|
|
|
|
|
# Transcription prompt and the sample waveform (16 kHz, mono). Neither name is
# referenced again in the visible part of this script.
mimick_prompt = "Please transcribe this audio into text."
audio_input, _ = librosa.load(audio_path, sr=16000, mono=True)

# System prompt telling the model when to use the search tool.
system_message = (
    "You are an assistant designed to help people answer questions.\n"
    "You have access to query the web using Bing Search. You should call bing search whenever a question requires up to date information or could benefit from web data.\n"
)

# Schema of the single tool offered to the model. The name must match the key
# used in the available-functions mapping further down.
tools = [
    dict(
        type="function",
        function=dict(
            name="search_bing",
            description="Tìm kiếm trên Bing để lấy thông tin cập nhật từ web",
            parameters=dict(
                type="object",
                properties=dict(
                    query=dict(
                        type="string",
                        description="Truy vấn tìm kiếm",
                    ),
                ),
                required=["query"],
            ),
        ),
    ),
]
|
|
# Conversation seed: the system prompt followed by one user question.
msgs = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": ["Chủ tịch nước hiện tại của Việt Nam là ai?"]},
]

# Maps the tool name advertised to the model onto the local search function.
available_functions = {"search_bing": search}

result = run_multiturn_conversation(msgs, tokenizer, tools, available_functions)

# Label and answer go on separate lines, exactly as two print() calls would.
print("Prediction: ", result, sep="\n")
|
|
|
|
|
|