Spaces:

YongdongWang
/

DART-LLM-Multi-Model

Sleeping

File size: 10,463 Bytes

import gradio as gr
import spaces  # Import spaces module for ZeroGPU
from huggingface_hub import login
import os
from json_processor import JsonProcessor
import json

# 1) Fetch the Hugging Face access token from the environment
hf_token = os.environ.get("HUGGINGFACE_TOKEN")
if not hf_token:
    # Abort at import time: nothing below can authenticate without a token
    raise RuntimeError("❌ HUGGINGFACE_TOKEN not detected, please check Space Settings → Secrets")
# 2) Authenticate once so that later from_pretrained calls run with this token
login(hf_token)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import warnings
import os
warnings.filterwarnings("ignore")

# Model configuration
# Hub id of the base checkpoint; also used as the tokenizer source.
MODEL_NAME = "meta-llama/Llama-3.1-8B"
# Hub id of the LoRA adapter that is attached on top of the base model.
LORA_MODEL = "YongdongWang/llama3.1-8b-lora-qlora-dart-llm"

# Global variables to store model and tokenizer
# Both remain None until the corresponding loader function has run.
model = None
tokenizer = None
# Set to True by load_model_and_tokenizer() once the tokenizer is ready;
# despite its name it does not track whether the model itself is loaded.
model_loaded = False

def load_model_and_tokenizer():
    """Prepare the shared tokenizer once; repeated calls do nothing.

    Despite the name, only the tokenizer is loaded here. It is stored in
    the module-level ``tokenizer`` and ``model_loaded`` is flipped to True
    so that subsequent calls skip the work.
    """
    global tokenizer, model_loaded

    if not model_loaded:
        print("🔄 Loading tokenizer...")

        # The tokenizer comes from the base checkpoint
        tokenizer_options = {"use_fast": False, "trust_remote_code": True}
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, **tokenizer_options)

        # Reuse the EOS token for padding when no pad token is defined
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model_loaded = True
        print("✅ Tokenizer loaded successfully!")

@spaces.GPU(duration=60)  # Request GPU for loading model at startup
def load_model_on_gpu():
    """Return the LoRA-adapted model, building it on first use.

    The base checkpoint is loaded with a 4-bit NF4 quantization config and
    the LoRA adapter is attached on top of it. The result is cached in the
    module-level ``model`` and switched to eval mode.

    Returns:
        The cached model if one exists, otherwise the freshly loaded one.

    Raises:
        Exception: any loading failure is printed and then re-raised.
    """
    global model

    # Reuse the cached instance when there is one
    if model is not None:
        return model

    print("🔄 Loading model on GPU...")

    try:
        # Quantization settings: 4-bit NF4, double quantization, fp16 compute
        quantization = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )

        # Options for the quantized base checkpoint
        base_options = {
            "quantization_config": quantization,
            "device_map": "auto",
            "torch_dtype": torch.float16,
            "trust_remote_code": True,
            "low_cpu_mem_usage": True,
            "use_safetensors": True,
        }
        quantized_base = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **base_options)

        # Attach the LoRA adapter to the base model
        adapter_options = {"torch_dtype": torch.float16, "use_safetensors": True}
        model = PeftModel.from_pretrained(quantized_base, LORA_MODEL, **adapter_options)
        model.eval()

        print("✅ Model loaded on GPU successfully!")
        return model

    except Exception as load_error:
        # Log the failure, then let the caller see the same exception
        print(f"❌ Model loading failed: {load_error}")
        raise

def process_json_in_response(response):
    """Pretty-print JSON embedded in ``response`` on a best-effort basis.

    When the text contains both brace characters it is handed to
    ``JsonProcessor.process_response``. A truthy result is serialized with
    two-space indentation and substituted for the span running from the
    first ``{`` to the last ``}``. Any failure along the way leaves the
    text untouched.

    Args:
        response: Text produced by the model.

    Returns:
        The text with its JSON span reformatted, or ``response`` unchanged.
    """
    try:
        # Nothing to do unless the text could contain a JSON object
        if '{' not in response or '}' not in response:
            return response

        parsed = JsonProcessor().process_response(response)
        if not parsed:
            return response

        pretty = json.dumps(parsed, indent=2, ensure_ascii=False)

        import re
        # Greedy match with DOTALL: spans from the first "{" to the last "}"
        brace_span = re.search(r'\{.*\}', response, re.DOTALL)
        if brace_span is None:
            return response

        return response.replace(brace_span.group(), pretty)
    except Exception:
        # Best effort only: fall back to the unmodified text on any error
        return response

@spaces.GPU(duration=60)  # GPU inference
def generate_response_gpu(prompt, max_tokens=512):
    """Generate a response for ``prompt`` with the fine-tuned model.

    The prompt is wrapped in the "### Instruction / ### Response" template,
    tokenized (truncated to 2048 tokens) and decoded greedily. Only the
    newly generated tokens are decoded, so the returned text never contains
    the prompt, even if the user text or the model output includes the
    "### Response:" marker or the prompt was truncated.

    Args:
        prompt: Natural-language command; surrounding whitespace is stripped.
        max_tokens: Upper bound on newly generated tokens. Coerced to ``int``
            because UI sliders may deliver floats.

    Returns:
        The generated text, with embedded JSON pretty-printed when possible,
        or a message starting with "❌" when the model is unavailable,
        nothing was generated, or generation raised an exception.
    """
    global model

    # Ensure tokenizer is loaded
    if tokenizer is None:
        load_model_and_tokenizer()

    # Ensure model is loaded on GPU
    if model is None:
        model = load_model_on_gpu()

    if model is None:
        return "❌ Model failed to load. Please check the Space logs."

    try:
        formatted_prompt = (
            "### Instruction:\n"
            f"{prompt.strip()}\n\n"
            "### Response:\n"
        )

        # Encode input
        inputs = tokenizer(
            formatted_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=2048
        ).to(model.device)

        # Number of prompt tokens; used below to slice off the echoed prompt
        prompt_length = inputs["input_ids"].shape[1]

        # Generate response (greedy decoding). temperature/top_p are passed
        # as None explicitly because sampling is disabled. early_stopping is
        # omitted: it only applies to beam search, which is not used here.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=int(max_tokens),
                do_sample=False,
                temperature=None,
                top_p=None,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
                no_repeat_ngram_size=3
            )

        # Decode only the tokens that follow the prompt. Stripping the prompt
        # from the decoded text by string matching is fragile.
        generated_ids = outputs[0][prompt_length:]
        response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        # Process JSON if present in response
        response = process_json_in_response(response)

        return response if response else "❌ No response generated. Please try again with a different prompt."

    except Exception as generation_error:
        # Report the failure as chat text instead of raising to the UI
        return f"❌ Generation Error: {str(generation_error)}"

def chat_interface(message, history, max_tokens):
    """Handle one chat turn: generate a reply and append it to the history.

    Args:
        message: Text entered by the user. ``None``, empty and
            whitespace-only values are ignored.
        history: List of ``(user_message, reply)`` tuples shown in the
            chatbot. ``None`` is treated as an empty history. The list is
            extended in place and returned.
        max_tokens: Generation limit forwarded to ``generate_response_gpu``.

    Returns:
        A ``(history, "")`` tuple; the empty string clears the input box.
    """
    # Guard against a missing history so the append below cannot fail
    if history is None:
        history = []

    # Ignore missing or blank input without touching the model
    if not message or not message.strip():
        return history, ""

    # Initialize tokenizer (if needed)
    if tokenizer is None:
        load_model_and_tokenizer()

    try:
        # Call GPU function to generate response
        response = generate_response_gpu(message, max_tokens)
    except Exception as chat_error:
        # Show the failure in the chat instead of breaking the UI
        response = f"❌ Chat Error: {str(chat_error)}"

    history.append((message, response))
    return history, ""

# Load tokenizer at startup
# (the model itself is loaded by load_model_on_gpu, not here)
load_model_and_tokenizer()

# Create Gradio application
# Layout: header text, a two-column row (chat on the left, settings on the
# right), a list of example commands, then the event wiring.
with gr.Blocks(
    title="Robot Task Planning - Llama 3.1 8B",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1200px;
        margin: auto;
    }
    """
) as app:
    # Static page header describing the model and the ZeroGPU setup
    gr.Markdown("""
    # 🤖 Llama 3.1 8B - Robot Task Planning
    
    This is a fine-tuned version of Meta's Llama 3.1 8B model specialized for **robot task planning** using QLoRA technique.
    
    **Capabilities**: Convert natural language robot commands into structured task sequences for excavators, dump trucks, and other construction robots.
    
    **Model**: [YongdongWang/llama3.1-8b-lora-qlora-dart-llm](https://huggingface.co/YongdongWang/llama3.1-8b-lora-qlora-dart-llm)
    
    ⚡ **Using ZeroGPU**: This Space uses dynamic GPU allocation (Nvidia H200). First generation might take a bit longer.
    """)
    
    with gr.Row():
        # Left column: chat transcript, command box and action buttons
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(
                label="Task Planning Results",
                height=500,
                show_label=True,
                container=True,
                bubble_full_width=False,
                show_copy_button=True
            )
            
            msg = gr.Textbox(
                label="Robot Command",
                placeholder="Enter robot task command (e.g., 'Deploy Excavator 1 to Soil Area 1')...",
                lines=2,
                max_lines=5,
                show_label=True,
                container=True
            )
            
            with gr.Row():
                send_btn = gr.Button("🚀 Generate Tasks", variant="primary", size="sm")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm")
        
        # Right column: generation settings and a status panel
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Generation Settings")
            
            # Passed to chat_interface as its max_tokens argument.
            # NOTE(review): the slider allows up to 5000 new tokens while
            # generate_response_gpu is decorated with duration=60 — confirm
            # that long generations fit into that GPU request.
            max_tokens = gr.Slider(
                minimum=50,
                maximum=5000,
                value=512,
                step=10,
                label="Max Tokens",
                info="Maximum number of tokens to generate"
            )
            
            # Fixed text; it is not updated from the actual model state
            gr.Markdown("""
            ### 📊 Model Status
            - **Hardware**: ZeroGPU (Dynamic Nvidia H200)
            - **Status**: Ready
            - **Note**: First generation allocates GPU resources
            """)
    
    # Example conversations
    # Each entry is a sample command bound to the command textbox
    gr.Examples(
        examples=[
            "Dump truck 1 goes to the puddle for inspection, after which all robots avoid the puddle.",
            "Drive the Excavator 1 to the obstacle, and perform excavation to clear the obstacle.",
            "Send Excavator 1 and Dump Truck 1 to the soil area; Excavator 1 will excavate and unload, followed by Dump Truck 1 proceeding to the puddle for unloading.",
            "Move Excavator 1 and Dump Truck 1 to soil area 2; Excavator 1 will excavate and unload, then Dump Truck 1 returns to the starting position to unload.",
            "Excavator 1 is guided to the obstacle to excavate and unload to clear the obstacle, then excavator 1 and dump truck 1 are moved to the soil area, and the excavator excavates and unloads. Finally, dump truck 1 unloads the soil into the puddle.",
            "Excavator 1 goes to the obstacle to excavate and unload to clear the obstacle. Once the obstacle is cleared, mobilize all available robots to proceed to the puddle area for inspection.",
        ],
        inputs=msg,
        label="💡 Example Operator Commands"
    )
    
    # Event handling
    # Submitting the textbox and clicking the button share one handler;
    # its second return value ("") is written back to the textbox.
    msg.submit(
        chat_interface,
        inputs=[msg, chatbot, max_tokens],
        outputs=[chatbot, msg]
    )
    
    send_btn.click(
        chat_interface,
        inputs=[msg, chatbot, max_tokens],
        outputs=[chatbot, msg]
    )
    
    # Reset both the transcript and the textbox
    clear_btn.click(
        lambda: ([], ""),
        outputs=[chatbot, msg]
    )

if __name__ == "__main__":
    # Start the Gradio server only when this file is executed as a script:
    # bind to every interface on port 7860 with sharing and error display on.
    app.launch(show_error=True, share=True, server_port=7860, server_name="0.0.0.0")