File size: 6,922 Bytes

20e4eaa

import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer, AutoProcessor
import librosa
from omni_speech.model import *
import json
from langchain_community.tools import DuckDuckGoSearchRun, DuckDuckGoSearchResults
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
import re
from duckduckgo_search import DDGS

# Notes for the MiniCPMO.from_pretrained(...) call further down in this file:
# by default the full omni model is loaded, i.e. init_vision / init_audio /
# init_tts are all True.
#   - To load a vision-only model, set init_audio=False and init_tts=False.
#   - To load an audio-only model, set init_vision=False.
#
# Shared DuckDuckGo client: created once at import time and reused by search().
ddgs_instance = DDGS()  # Initialize once

def search(query):
    """Run a DuckDuckGo text search and return the hits as a JSON string.

    The query is lower-cased and echoed to stdout, then sent through the
    module-level ``ddgs_instance`` with ``max_results=2``.

    Args:
        query: Search phrase; must support ``.lower()``.

    Returns:
        A JSON array string (non-ASCII characters are kept verbatim) whose
        items carry a ``title`` and a ``snippet`` key, filled from each hit's
        ``title`` and ``body`` fields.
    """
    normalized = query.lower()
    print("Query: ")
    print(normalized)

    hits = ddgs_instance.text(normalized, max_results=2)

    # Only title and body text are forwarded; the hit's 'href' link is not
    # part of the payload.
    summaries = [
        {'title': hit['title'], 'snippet': hit['body']}
        for hit in hits
    ]
    return json.dumps(summaries, ensure_ascii=False)

# Wrapper tags that may surround a function-call payload. The optional "s"
# matters: the caller detects "<tool_calls>", and a pattern for "tool_call"
# alone would leave that tag in place and make the JSON parse fail.
_CALL_TAG_RE = re.compile(r"</?(?:tool_calls?|response)>")

# Markers whose presence makes a reply a candidate function call.
_CALL_MARKERS = ("<tool_call>", "<tool_calls>", "<response>")


def _chat_once(messages, tokenizer, functions):
    """Request one text-only reply from the module-level ``model``."""
    return model.chat(
        msgs=messages,
        tokenizer=tokenizer,
        sampling=False,
        max_new_tokens=128,
        temperature=0.3,
        generate_audio=False,
        fc=functions,
        # chunk_input=False,
        # processor=processor
    )


def _parse_function_call(response):
    """Return the function-call dict encoded in *response*, or ``None``.

    A reply that already starts with ``{`` is first tried verbatim, so tag-like
    text inside its string values is not disturbed. Otherwise (or if that
    fails) the wrapper tags are stripped and the remainder is parsed. Anything
    that is not a JSON object with a ``name`` key yields ``None``.
    """
    text = response.strip()
    candidates = [text] if text.startswith("{") else []
    candidates.append(_CALL_TAG_RE.sub("", text).strip())

    for candidate in candidates:
        try:
            call = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(call, dict) and "name" in call:
            return call
    return None


def run_multiturn_conversation(messages, tokenizer, functions, available_functions):
    """Chat with the model, executing at most one requested function call.

    The conversation is sent to the module-level ``model``. If the reply looks
    like a function call (it contains a ``<tool_call>``, ``<tool_calls>`` or
    ``<response>`` tag, or starts with ``{``) and parses to a JSON object with
    a ``name``, the matching callable from *available_functions* is invoked
    and the model is queried a second time with the call and its result
    appended to *messages*. Only a single function-call round is performed;
    the second reply is returned as-is.

    Args:
        messages: Conversation so far, as a list of message dicts. It is
            mutated in place: an assistant ``function_call`` message and a
            ``tool`` message are appended when a function is executed.
        tokenizer: Tokenizer forwarded to ``model.chat``.
        functions: Tool/function schemas forwarded to ``model.chat`` as ``fc``.
        available_functions: Mapping from function name to the callable that
            implements it. Callables are invoked with the parsed arguments as
            keyword arguments.

    Returns:
        The model's reply. This is the first reply when it is not a function
        call or cannot be parsed as one, and the follow-up reply after a
        function was executed. When the requested function is unknown or its
        arguments are unusable, a short error string is returned instead.
    """
    # Step 1: send the conversation and available functions to the model
    response = _chat_once(messages, tokenizer, functions)

    # Step 2: check whether the model wants to call a function.
    # startswith() (rather than response[0]) keeps an empty reply from
    # raising IndexError.
    looks_like_call = (
        any(marker in response for marker in _CALL_MARKERS)
        or response.lstrip().startswith("{")
    )
    if not looks_like_call:
        return response

    print("Recommended Function call:")
    print(response)

    function = _parse_function_call(response)
    if function is None:
        # The generated JSON is not always valid (and a tagged reply may just
        # be a plain answer); hand the raw reply back instead of crashing.
        print("Could not parse a function call; returning the raw response.")
        return response

    print("Function: ")
    print(function)

    # Step 3: call the function
    function_name = function["name"]

    # verify function exists
    if function_name not in available_functions:
        return f"Function {function_name} does not exist"
    function_to_call = available_functions[function_name]

    # Arguments may be absent, or delivered as a JSON-encoded string rather
    # than an object; normalize both to a dict before unpacking.
    function_args = function.get("arguments") or {}
    if isinstance(function_args, str):
        try:
            function_args = json.loads(function_args)
        except json.JSONDecodeError:
            function_args = None
    if not isinstance(function_args, dict):
        return f"Function {function_name} received invalid arguments"

    function_response = function_to_call(**function_args)

    print("Output of function call:")
    print(function_response)

    # Step 4: send the info on the function call and function response back
    # to the model.

    # adding assistant response to messages
    messages.append(
        {
            "role": "assistant",
            "function_call": [{
                "type": "function",
                "function": {"name": function_name, "arguments": function_args},
            }],
            "content": ""
        }
    )

    # adding function response to messages
    messages.append(
        {
            "role": "tool",
            "name": function_name,
            "content": function_response,
        }
    )  # extend conversation with function response

    print("Messages in next request:")
    for message in messages:
        print(message)

    return _chat_once(messages, tokenizer, functions)

# ---------------------------------------------------------------------------
# Locations of the checkpoint and of a sample recording.
# ---------------------------------------------------------------------------
model_path = "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/minicpmo_sft_vi_fc"
audio_path = "/data1/speech/speechData/data_Vi/audio_Vi/reading/db_vinAI_recoring_cleaned/wav-cleaned/tay-ninh/3624328/127-1.wav"

# ---------------------------------------------------------------------------
# Model and tokenizer.
# The checkpoint is loaded with audio initialisation enabled and vision / TTS
# initialisation disabled, then put in eval mode and moved to the GPU.
# Options tried earlier and currently not passed: a JSON config file expanded
# as keyword arguments, pretrained_encoder_path, chunk_input=False.
# ---------------------------------------------------------------------------
model = MiniCPMO.from_pretrained(
    model_path,
    trust_remote_code=True,
    attn_implementation='sdpa',  # alternative: flash_attention_2
    torch_dtype=torch.bfloat16,
    init_vision=False,
    init_audio=True,
    init_tts=False,
    processor_path="/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6",
).eval().cuda()

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# An AutoProcessor loaded from the MiniCPM-o-2_6 directory was used here
# previously; it is not needed for the text-only request below.

# ---------------------------------------------------------------------------
# Inputs. The audio clip and transcription prompt are prepared but only used
# by the alternative (speech) user messages noted further down.
# ---------------------------------------------------------------------------
mimick_prompt = "Please transcribe this audio into text."
audio_input, _ = librosa.load(audio_path, sr=16000, mono=True)

system_message = """You are an assistant designed to help people answer questions.
You have access to query the web using Bing Search. You should call bing search whenever a question requires up to date information or could benefit from web data.
"""
# Alternative system prompt (plain speech assistant, no tool use):
# "You are a helpful language and speech assistant. You are able to understand the speech content that the user provides, and assist the user with a variety of tasks using natural language."

# Single tool exposed to the model; its name must match a key of
# available_functions below.
tools = [
    {
        "type": "function",
        "function": {
            "name": "search_bing",
            "description": "Tìm kiếm trên Bing để lấy thông tin cập nhật từ web",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string", "description": "Truy vấn tìm kiếm"},
                },
                "required": ["query"],
            },
        },
    },
]

# Conversation: system prompt followed by one text question.
# To send speech instead, use [audio_input, mimick_prompt] or [audio_input]
# as the user message content.
msgs = [
    {'role': 'system', 'content': system_message},
    {'role': 'user', 'content': ["Chủ tịch nước hiện tại của Việt Nam là ai?"]},
]

# The "search_bing" tool is served by the local search() helper.
available_functions = {'search_bing': search}

# Run the conversation with tool support. (A direct model.chat(...) call with
# the same generation settings and fc=tools was used before this helper.)
result = run_multiturn_conversation(msgs, tokenizer, tools, available_functions)

print("Prediction: ")
print(result)