import gradio as gr
from loguru import logger
from gradio_llm_interface import GradioLlmInterface
from config import GRADIO_MESSAGE_MODES, MODE_CONFIG
import openai
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()


# Speech-to-text function using OpenAI Whisper
def audio_to_text(audio):
    if audio is None:
        return "No audio file provided."
    try:
        # Get OpenAI API key from environment variable
        openai_api_key = os.getenv("OPENAI_API_KEY")
        if not openai_api_key:
            return "Error: OpenAI API key not found. Please set the OPENAI_API_KEY environment variable."

        # Initialize OpenAI client
        client = openai.OpenAI(api_key=openai_api_key)

        # Open and transcribe the audio file
        with open(audio, "rb") as audio_file:
            transcript = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file
            )
        return transcript.text
    except FileNotFoundError:
        return "Error: Audio file not found."
    except openai.AuthenticationError:
        return "Error: Invalid OpenAI API key."
    except openai.RateLimitError:
        return "Error: OpenAI API rate limit exceeded."
    except Exception as e:
        logger.error(f"Speech-to-text error: {str(e)}")
        return f"Error during speech recognition: {str(e)}"
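
# NOTE: config.py is not shown in this file. Based on how it is used in main() below,
# it is assumed to expose a list of mode keys plus a per-mode config dict with a
# "display_name" entry, roughly like the illustrative sketch here (the keys and
# labels are placeholders, not the actual project configuration):
#
#     GRADIO_MESSAGE_MODES = ["dart_llm_agent", "roboqa"]
#     MODE_CONFIG = {
#         "dart_llm_agent": {"display_name": "DART-LLM Agent"},
#         "roboqa": {"display_name": "RoboQA"},
#     }
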
def main():
    gradio_ros_interface = GradioLlmInterface()

    title_markdown = ("""
# 🌋 DART-LLM: Dependency-Aware Multi-Robot Task Decomposition and Execution using Large Language Models
[[Project Page](https://wyd0817.github.io/project-dart-llm/)] [[Code](https://github.com/wyd0817/gradio_gpt_interface)] [[Model](https://artificialanalysis.ai/)] | 📚 [[RoboQA](https://www.overleaf.com/project/6614a987ae2994cae02efcb2)]
""")

    with gr.Blocks(css="""
        #text-input, #audio-input {
            height: 100px;       /* Unified height */
            max-height: 100px;
            width: 100%;         /* Full container width */
            margin: 0;
        }
        .input-container {
            display: flex;       /* Flex layout */
            gap: 10px;           /* Spacing */
            align-items: center; /* Vertical alignment */
        }
        #voice-input-container {
            display: flex;
            align-items: center;
            gap: 15px;
            margin: 15px 0;
            padding: 15px;
            background: linear-gradient(135deg, #ffeef8 0%, #fff5f5 100%);
            border-radius: 20px;
            border: 1px solid #ffe4e6;
        }
        #voice-btn {
            width: 50px !important;
            height: 50px !important;
            border-radius: 50% !important;
            font-size: 20px !important;
            background: linear-gradient(135deg, #ff6b9d 0%, #c44569 100%) !important;
            color: white !important;
            border: none !important;
            box-shadow: 0 4px 15px rgba(255, 107, 157, 0.3) !important;
            transition: all 0.3s ease !important;
        }
        #voice-btn:hover {
            transform: scale(1.05) !important;
            box-shadow: 0 6px 20px rgba(255, 107, 157, 0.4) !important;
        }
        #voice-btn:active {
            transform: scale(0.95) !important;
        }
        .voice-recording {
            background: linear-gradient(135deg, #ff4757 0%, #ff3742 100%) !important;
            animation: pulse 1.5s infinite !important;
        }
        @keyframes pulse {
            0% { box-shadow: 0 4px 15px rgba(255, 71, 87, 0.3); }
            50% { box-shadow: 0 4px 25px rgba(255, 71, 87, 0.6); }
            100% { box-shadow: 0 4px 15px rgba(255, 71, 87, 0.3); }
        }
        #voice-status {
            color: #ff6b9d;
            font-size: 14px;
            font-weight: 500;
            text-align: center;
            margin-top: 10px;
        }
        /* Enhanced layout for left-right split */
        .gradio-container .gradio-row {
            gap: 20px; /* Add spacing between columns */
        }
        .gradio-column {
            padding: 10px;
            border-radius: 8px;
            background-color: var(--panel-background-fill);
        }
        /* Chat interface styling */
        .chat-column {
            border: 1px solid var(--border-color-primary);
        }
        /* DAG visualization column styling */
        .dag-column {
            border: 1px solid var(--border-color-primary);
        }
    """) as demo:
        gr.Markdown(title_markdown)

        mode_choices = [MODE_CONFIG[mode]["display_name"] for mode in GRADIO_MESSAGE_MODES]
        mode_selector = gr.Radio(choices=mode_choices, label="Backend model", value=mode_choices[0])
        clear_button = gr.Button("Clear Chat")

        logger.info("Starting Gradio GPT Interface...")

        initial_mode = GRADIO_MESSAGE_MODES[0]

        def update_mode(selected_mode, state):
            mode_key = [key for key, value in MODE_CONFIG.items() if value["display_name"] == selected_mode][0]
            return gradio_ros_interface.update_chatbot(mode_key, state)

        # Main content area with left-right layout
        with gr.Row():
            # Left column: Chat interface
            with gr.Column(scale=1, elem_classes=["chat-column"]):
                gr.Markdown("### 🤖 DART-LLM Chat Interface")

                # Create chatbot component in the left column
                chatbot_container = gr.Chatbot(label="DART-LLM", type="messages")

                # Initialize the interface and get state data
                state_data = gradio_ros_interface.initialize_interface(initial_mode)
                state = gr.State(state_data)

                # Add input area in the left column
                with gr.Row(elem_id="input-container"):
                    txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter", elem_id="text-input", container=False)

                with gr.Row(elem_id="voice-input-container"):
                    with gr.Column(scale=4):
                        # Hidden audio component
                        audio_input = gr.Audio(
                            sources=["microphone"],
                            type="filepath",
                            elem_id="audio-input",
                            show_label=False,
                            interactive=True,
                            streaming=False,
                            visible=False
                        )
                        # Voice input status display
                        voice_status = gr.Markdown("", elem_id="voice-status", visible=False)
                    with gr.Column(scale=1, min_width=80):
                        # Main voice button
                        voice_btn = gr.Button(
                            "🎙️",
                            elem_id="voice-btn",
                            variant="secondary",
                            size="sm",
                            scale=1
                        )

                # Example prompts in the left column
                gr.Examples(
                    examples=[
                        "Dump truck 1 goes to the puddle for inspection, after which all robots avoid the puddle",
                        "Send Excavator 1 and Dump Truck 1 to the soil area; Excavator 1 will excavate and unload, followed by Dump Truck 1 proceeding to the puddle for unloading."
                    ],
                    inputs=txt
                )

            # Right column: DAG visualization and controls
            with gr.Column(scale=1, elem_classes=["dag-column"]):
                gr.Markdown("### 📊 Task Dependency Visualization")

                # DAG visualization display
                dag_image = gr.Image(label="Task Dependency Graph", visible=True, height=600)

                # Task plan editing section
                task_editor = gr.Code(
                    label="Task Plan JSON Editor",
                    language="json",
                    visible=False,
                    lines=15,
                    interactive=True
                )

                # Control buttons section
                with gr.Row():
                    with gr.Column(scale=2):
                        deployment_status = gr.Markdown("", visible=True)
                    with gr.Column(scale=1):
                        with gr.Row():
                            edit_task_btn = gr.Button(
                                "📝 Edit Task Plan",
                                variant="secondary",
                                visible=False,
                                size="sm"
                            )
                            update_dag_btn = gr.Button(
                                "🔄 Update DAG Visualization",
                                variant="secondary",
                                visible=False,
                                size="sm"
                            )
                            validate_deploy_btn = gr.Button(
                                "🔒 Validate & Deploy Task Plan",
                                variant="primary",
                                visible=False,
                                size="sm"
                            )

        mode_selector.change(update_mode, inputs=[mode_selector, state], outputs=[chatbot_container, state])
        clear_button.click(gradio_ros_interface.clear_chat, inputs=[state], outputs=[chatbot_container])

        # Handle text input submission
        async def handle_text_submit(text, state):
            messages, state, dag_image_path, validate_btn_update = await gradio_ros_interface.predict(text, state)
            # Show the edit button when a task plan has been generated
            edit_btn_visible = validate_btn_update.get('visible', False)
            return (
                "",  # Clear the text input after submission
                messages,
                state,
                dag_image_path,
                validate_btn_update,
                gr.update(visible=edit_btn_visible)  # Show edit button
            )

        txt.submit(handle_text_submit, [txt, state], [txt, chatbot_container, state, dag_image, validate_deploy_btn, edit_task_btn])

        # Voice input state management
        voice_recording = gr.State(False)

        # Voice button click handler
        def handle_voice_input(audio, is_recording):
            logger.info(f"Voice button clicked, current recording state: {is_recording}")
            if not is_recording:
                # Start recording state
                logger.info("Starting recording...")
                return (
                    gr.update(value="🔴", elem_classes=["voice-recording"]),  # Change button style
                    "💬 Recording in progress...",  # Status message
                    gr.update(visible=True),   # Show status
                    gr.update(visible=True),   # Show audio component
                    True,                      # Update recording state
                    ""                         # Clear text box
                )
            else:
                # Stop recording and transcribe
                logger.info("Stopping recording, starting transcription...")
                if audio is not None and audio != "":
                    try:
                        text = audio_to_text(audio)
                        logger.info(f"Transcription completed: {text}")
                        return (
                            gr.update(value="🎙️", elem_classes=[]),  # Restore button style
                            "✨ Transcription completed!",  # Success message
                            gr.update(visible=True),   # Show status
                            gr.update(visible=False),  # Hide audio component
                            False,                     # Reset recording state
                            text                       # Fill in transcribed text
                        )
                    except Exception as e:
                        logger.error(f"Transcription error: {e}")
                        return (
                            gr.update(value="🎙️", elem_classes=[]),  # Restore button style
                            f"❌ Transcription failed: {str(e)}",
                            gr.update(visible=True),
                            gr.update(visible=False),
                            False,
                            ""
                        )
                else:
                    logger.warning("No audio detected")
                    return (
                        gr.update(value="🎙️", elem_classes=[]),  # Restore button style
                        "⚠️ No audio detected, please record again",
                        gr.update(visible=True),
                        gr.update(visible=False),
                        False,
                        ""
                    )

        # Voice button event handling
        # voice_status is listed twice in outputs: the second return value sets its
        # text and the third updates its visibility.
        voice_btn.click(
            handle_voice_input,
            inputs=[audio_input, voice_recording],
            outputs=[voice_btn, voice_status, voice_status, audio_input, voice_recording, txt]
        )

        # Audio state change listener - automatic prompt
        def on_audio_change(audio):
            if audio is not None:
                logger.info("Audio file detected")
                return "🎵 Audio detected, you can click the button to complete transcription"
            return ""

        audio_input.change(
            on_audio_change,
            inputs=[audio_input],
            outputs=[voice_status]
        )

        # Handle task plan editing
        edit_task_btn.click(
            gradio_ros_interface.show_task_plan_editor,
            inputs=[state],
            outputs=[task_editor, update_dag_btn, validate_deploy_btn, deployment_status]
        )

        # Handle DAG update from editor
        update_dag_btn.click(
            gradio_ros_interface.update_dag_from_editor,
            inputs=[task_editor, state],
            outputs=[dag_image, validate_deploy_btn, task_editor, update_dag_btn, deployment_status, state]
        )

        # Handle validation and deployment
        validate_deploy_btn.click(
            gradio_ros_interface.validate_and_deploy_task_plan,
            inputs=[state],
            outputs=[deployment_status, dag_image, validate_deploy_btn, state]
        )

    demo.launch(server_port=8080, share=True)


if __name__ == "__main__":
    main()
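
# NOTE: GradioLlmInterface (gradio_llm_interface.py) is not shown in this file.
# The signatures below are inferred from how its methods are wired above and are
# illustrative only, not the authoritative API of that class:
#
#   initialize_interface(mode_key)            -> initial state data for gr.State
#   update_chatbot(mode_key, state)           -> (chatbot messages, state)
#   clear_chat(state)                         -> chatbot messages
#   async predict(text, state)                -> (messages, state, dag_image_path, validate_btn_update)
#   show_task_plan_editor(state)              -> updates for (task_editor, update_dag_btn,
#                                                validate_deploy_btn, deployment_status)
#   update_dag_from_editor(task_json, state)  -> updates for (dag_image, validate_deploy_btn, task_editor,
#                                                update_dag_btn, deployment_status, state)
#   validate_and_deploy_task_plan(state)      -> updates for (deployment_status, dag_image,
#                                                validate_deploy_btn, state)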