import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer, AutoProcessor
import librosa
from omni_speech.model import *
import json
from langchain_community.tools import DuckDuckGoSearchRun, DuckDuckGoSearchResults
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
import re
from duckduckgo_search import DDGS

# Load the omni model. By default init_vision/init_audio/init_tts are all True.
# For a vision-only model, set init_audio=False and init_tts=False.
# For an audio-only model, set init_vision=False.

# NOTE: the wrapper tags the fine-tuned model emits around a function call were
# lost from this file; "<tool_call>" / "</tool_call>" are assumed placeholders.
# Adjust them to match the chat template of your checkpoint.
TOOL_CALL_START = "<tool_call>"
TOOL_CALL_END = "</tool_call>"

ddgs_instance = DDGS()  # initialize once and reuse across queries


def search(query):
    """Run a DuckDuckGo text search and return the top results as a JSON string."""
    # search = DuckDuckGoSearchResults(output_format="list")
    # search = DuckDuckGoSearchRun()
    query = query.lower()
    print("Query: ")
    print(query)
    # res = search.invoke(query)
    res = ddgs_instance.text(query, max_results=2)
    output = []
    for result in res:
        output.append({
            'title': result['title'],
            # 'link': result['href'],
            'snippet': result['body']
        })
    return json.dumps(output, ensure_ascii=False)


def run_multiturn_conversation(messages, tokenizer, functions, available_functions):
    # Step 1: send the conversation and the available functions to the model
    response = model.chat(
        msgs=messages,
        tokenizer=tokenizer,
        sampling=False,
        max_new_tokens=128,
        temperature=0.3,
        generate_audio=False,
        fc=functions
        # chunk_input=False,
        # processor=processor
    )

    # Step 2: check whether the model wanted to call a function
    if (TOOL_CALL_START in response) or response.startswith("{") or (TOOL_CALL_END in response):
        print("Recommended Function call:")
        print(response)

        # Note: the JSON response may not always be valid; handle parse errors
        if response.startswith("{"):
            payload = response
        else:
            payload = re.sub(re.escape(TOOL_CALL_START), "", response).strip()
            payload = re.sub(re.escape(TOOL_CALL_END), "", payload).strip()
        try:
            function = json.loads(payload)
        except json.JSONDecodeError:
            return response  # malformed function-call JSON; return the raw response
        print("Function: ")
        print(function)

        # Step 3: call the function
        function_name = function["name"]

        # verify that the function exists
        if function_name not in available_functions:
            return "Function " + function_name + " does not exist"
        function_to_call = available_functions[function_name]

        function_args = function["arguments"]
        function_response = function_to_call(**function_args)
        print("Output of function call:")
        print(function_response)

        # Step 4: send the function call and its result back to the model
        # add the assistant's function call to the conversation
        messages.append(
            {
                "role": "assistant",
                "function_call": [{
                    "type": "function",
                    "function": {"name": function_name, "arguments": function_args},
                }],
                "content": ""
            }
        )
        # add the function response to the conversation
        messages.append(
            {
                "role": "tool",
                "name": function_name,
                "content": function_response,
            }
        )
        print("Messages in next request:")
        for message in messages:
            print(message)
        response = model.chat(
            msgs=messages,
            tokenizer=tokenizer,
            sampling=False,
            max_new_tokens=128,
            temperature=0.3,
            generate_audio=False,
            fc=functions
            # chunk_input=False,
            # processor=processor
        )
    return response


model_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/minicpmo_sft_vi_fc"
audio_path = "/data1/speech/speechData/data_Vi/audio_Vi/reading/db_vinAI_recoring_cleaned/wav-cleaned/tay-ninh/3624328/127-1.wav"
# minipcmo_config = json.load(open("/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/scripts/minicpmp_config.json", "r"))

model = MiniCPMO.from_pretrained(
    model_path,
    trust_remote_code=True,
    attn_implementation='sdpa',  # sdpa or flash_attention_2
    torch_dtype=torch.bfloat16,
    init_vision=False,
    init_audio=True,
    init_tts=False,
    processor_path="/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6",
    # pretrained_encoder_path="/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/speech_encoder/whisper-medium",
    # chunk_input=False,
    # **minipcmo_config
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# processor = AutoProcessor.from_pretrained("/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6", trust_remote_code=True)

mimick_prompt = "Please transcribe this audio into text."
audio_input, _ = librosa.load(audio_path, sr=16000, mono=True)

system_message = """You are an assistant designed to help people answer questions. You have access to query the web using Bing Search. You should call bing search whenever a question requires up-to-date information or could benefit from web data.
"""
# system_message = "You are a helpful language and speech assistant. You are able to understand the speech content that the user provides, and assist the user with a variety of tasks using natural language."

tools = [
    {
        "type": "function",
        "function": {
            "name": "search_bing",
            # "Search Bing to get up-to-date information from the web"
            "description": "Tìm kiếm trên Bing để lấy thông tin cập nhật từ web",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Truy vấn tìm kiếm",  # "The search query"
                    }
                },
                "required": ["query"],
            },
        }
    }
]

msgs = [{'role': 'system', 'content': system_message}]
# msgs.append({'role': 'user', 'content': [audio_input, mimick_prompt]})
# msgs.append({'role': 'user', 'content': [audio_input]})
# "Who is the current President of Vietnam?"
msgs.append({'role': 'user', 'content': ["Chủ tịch nước hiện tại của Việt Nam là ai?"]})

# the 'search_bing' tool is backed by the DuckDuckGo search() helper defined above
available_functions = {'search_bing': search}

result = run_multiturn_conversation(msgs, tokenizer, tools, available_functions)

# result = model.chat(
#     msgs=msgs,
#     tokenizer=tokenizer,
#     sampling=False,
#     max_new_tokens=128,
#     temperature=0.3,
#     generate_audio=False,
#     fc=tools
#     # chunk_input=False,
#     # processor=processor
# )

print("Prediction: ")
print(result)