Yongdong Wang committed
Commit 792bd1c · 1 Parent(s): 83496e3

Add multi-model support.

Files changed (1)
  1. app.py +78 -31
app.py CHANGED
@@ -19,45 +19,74 @@ import warnings
 import os
 warnings.filterwarnings("ignore")

-# Model configuration
-MODEL_NAME = "meta-llama/Llama-3.1-8B"
-LORA_MODEL = "YongdongWang/llama3.1-8b-lora-qlora-dart-llm"
+# Model configurations
+MODEL_CONFIGS = {
+    "1B": {
+        "name": "Dart-llm-model-1B",
+        "base_model": "meta-llama/Llama-3.2-1B",
+        "lora_model": "YongdongWang/llama-3.2-1b-lora-qlora-dart-llm"
+    },
+    "3B": {
+        "name": "Dart-llm-model-3B",
+        "base_model": "meta-llama/Llama-3.2-3B",
+        "lora_model": "YongdongWang/llama-3.2-3b-lora-qlora-dart-llm"
+    },
+    "8B": {
+        "name": "Dart-llm-model-8B",
+        "base_model": "meta-llama/Llama-3.1-8B",
+        "lora_model": "YongdongWang/llama-3.1-8b-lora-qlora-dart-llm"
+    }
+}
+
+DEFAULT_MODEL = "1B"  # Set 1B as default

 # Global variables to store model and tokenizer
 model = None
 tokenizer = None
+current_model_config = None
 model_loaded = False

-def load_model_and_tokenizer():
+def load_model_and_tokenizer(selected_model=DEFAULT_MODEL):
     """Load tokenizer - executed on CPU"""
-    global tokenizer, model_loaded
+    global tokenizer, model_loaded, current_model_config

-    if model_loaded:
+    if model_loaded and current_model_config == selected_model:
         return

-    print("πŸ”„ Loading tokenizer...")
+    print(f"πŸ”„ Loading tokenizer for {MODEL_CONFIGS[selected_model]['name']}...")

     # Load tokenizer (on CPU)
+    base_model = MODEL_CONFIGS[selected_model]["base_model"]
     tokenizer = AutoTokenizer.from_pretrained(
-        MODEL_NAME,
+        base_model,
         use_fast=False,
         trust_remote_code=True
     )
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token

+    current_model_config = selected_model
     model_loaded = True
     print("βœ… Tokenizer loaded successfully!")

 @spaces.GPU(duration=60)  # Request GPU for loading model at startup
-def load_model_on_gpu():
+def load_model_on_gpu(selected_model=DEFAULT_MODEL):
     """Load model on GPU"""
     global model

-    if model is not None:
+    # If model is already loaded and it's the same model, return it
+    if model is not None and current_model_config == selected_model:
         return model

-    print("πŸ”„ Loading model on GPU...")
+    # Clear existing model if switching
+    if model is not None:
+        print("πŸ—‘οΈ Clearing existing model from GPU...")
+        del model
+        torch.cuda.empty_cache()
+        model = None
+
+    model_config = MODEL_CONFIGS[selected_model]
+    print(f"πŸ”„ Loading {model_config['name']} on GPU...")

     try:
         # 4-bit quantization configuration
@@ -70,7 +99,7 @@ def load_model_on_gpu():

         # Load base model
         base_model = AutoModelForCausalLM.from_pretrained(
-            MODEL_NAME,
+            model_config["base_model"],
             quantization_config=bnb_config,
             device_map="auto",
             torch_dtype=torch.float16,
@@ -82,13 +111,13 @@ def load_model_on_gpu():
         # Load LoRA adapter
         model = PeftModel.from_pretrained(
             base_model,
-            LORA_MODEL,
+            model_config["lora_model"],
             torch_dtype=torch.float16,
             use_safetensors=True
         )
         model.eval()

-        print("βœ… Model loaded on GPU successfully!")
+        print(f"βœ… {model_config['name']} loaded on GPU successfully!")
         return model

     except Exception as load_error:
@@ -122,17 +151,17 @@ def process_json_in_response(response):
     return response

 @spaces.GPU(duration=60)  # GPU inference
-def generate_response_gpu(prompt, max_tokens=512):
+def generate_response_gpu(prompt, max_tokens=512, selected_model=DEFAULT_MODEL):
     """Generate response - executed on GPU"""
     global model

     # Ensure tokenizer is loaded
-    if tokenizer is None:
-        load_model_and_tokenizer()
+    if tokenizer is None or current_model_config != selected_model:
+        load_model_and_tokenizer(selected_model)

     # Ensure model is loaded on GPU
-    if model is None:
-        model = load_model_on_gpu()
+    if model is None or current_model_config != selected_model:
+        model = load_model_on_gpu(selected_model)

     if model is None:
         return "❌ Model failed to load. Please check the Space logs."
@@ -184,18 +213,18 @@ def generate_response_gpu(prompt, max_tokens=512):
     except Exception as generation_error:
         return f"❌ Generation Error: {str(generation_error)}"

-def chat_interface(message, history, max_tokens):
+def chat_interface(message, history, max_tokens, selected_model):
     """Chat interface - runs on CPU, calls GPU functions"""
     if not message.strip():
         return history, ""

     # Initialize tokenizer (if needed)
-    if tokenizer is None:
-        load_model_and_tokenizer()
+    if tokenizer is None or current_model_config != selected_model:
+        load_model_and_tokenizer(selected_model)

     try:
         # Call GPU function to generate response
-        response = generate_response_gpu(message, max_tokens)
+        response = generate_response_gpu(message, max_tokens, selected_model)
         history.append((message, response))
         return history, ""
     except Exception as chat_error:
@@ -203,12 +232,12 @@ def chat_interface(message, history, max_tokens):
         history.append((message, error_msg))
         return history, ""

-# Load tokenizer at startup
-load_model_and_tokenizer()
+# Load tokenizer at startup with default model
+load_model_and_tokenizer(DEFAULT_MODEL)

 # Create Gradio application
 with gr.Blocks(
-    title="Robot Task Planning - Llama 3.1 8B",
+    title="Robot Task Planning - DART-LLM Multi-Model",
     theme=gr.themes.Soft(),
     css="""
     .gradio-container {
@@ -218,13 +247,20 @@ with gr.Blocks(
     """
 ) as app:
     gr.Markdown("""
-    # πŸ€– Llama 3.1 8B - Robot Task Planning
+    # πŸ€– DART-LLM Multi-Model - Robot Task Planning
+
+    Choose from **three fine-tuned models** specialized for **robot task planning** using QLoRA technique:

-    This is a fine-tuned version of Meta's Llama 3.1 8B model specialized for **robot task planning** using QLoRA technique.
+    - **πŸš€ Dart-llm-model-1B** (Default): Fastest inference, optimized for speed
+    - **βš–οΈ Dart-llm-model-3B**: Balanced performance and quality
+    - **🎯 Dart-llm-model-8B**: Best quality output, higher latency

     **Capabilities**: Convert natural language robot commands into structured task sequences for excavators, dump trucks, and other construction robots.

-    **Model**: [YongdongWang/llama3.1-8b-lora-qlora-dart-llm](https://huggingface.co/YongdongWang/llama3.1-8b-lora-qlora-dart-llm)
+    **Models**:
+    - [YongdongWang/llama-3.2-1b-lora-qlora-dart-llm](https://huggingface.co/YongdongWang/llama-3.2-1b-lora-qlora-dart-llm) (Default)
+    - [YongdongWang/llama-3.2-3b-lora-qlora-dart-llm](https://huggingface.co/YongdongWang/llama-3.2-3b-lora-qlora-dart-llm)
+    - [YongdongWang/llama-3.1-8b-lora-qlora-dart-llm](https://huggingface.co/YongdongWang/llama-3.1-8b-lora-qlora-dart-llm)

     ⚑ **Using ZeroGPU**: This Space uses dynamic GPU allocation (Nvidia H200). First generation might take a bit longer.
     """)
@@ -256,6 +292,14 @@ with gr.Blocks(
         with gr.Column(scale=1):
             gr.Markdown("### βš™οΈ Generation Settings")

+            model_selector = gr.Dropdown(
+                choices=[(config["name"], key) for key, config in MODEL_CONFIGS.items()],
+                value=DEFAULT_MODEL,
+                label="Model Size",
+                info="Select model size (1B = fastest, 8B = best quality)",
+                interactive=True
+            )
+
             max_tokens = gr.Slider(
                 minimum=50,
                 maximum=5000,
@@ -270,6 +314,9 @@
             - **Hardware**: ZeroGPU (Dynamic Nvidia H200)
             - **Status**: Ready
             - **Note**: First generation allocates GPU resources
+            - **Dart-llm-model-1B**: Fastest inference (Default)
+            - **Dart-llm-model-3B**: Balanced speed/quality
+            - **Dart-llm-model-8B**: Best quality, slower
             """)

     # Example conversations
@@ -289,13 +336,13 @@ with gr.Blocks(
     # Event handling
     msg.submit(
         chat_interface,
-        inputs=[msg, chatbot, max_tokens],
+        inputs=[msg, chatbot, max_tokens, model_selector],
         outputs=[chatbot, msg]
     )

     send_btn.click(
         chat_interface,
-        inputs=[msg, chatbot, max_tokens],
+        inputs=[msg, chatbot, max_tokens, model_selector],
         outputs=[chatbot, msg]
     )
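For reference, the reload guard this commit adds around `model` and `current_model_config` can be exercised on its own. The snippet below is a minimal sketch, not part of app.py: `load_backbone` and `ensure_model` are hypothetical stand-ins for the real 4-bit quantization + LoRA loading path, and only the guard logic mirrors the diff above.

# Minimal sketch of the model-switching guard (hypothetical reduction of app.py's logic;
# load_backbone stands in for the AutoModelForCausalLM + PeftModel loading path).
import gc

MODEL_CONFIGS = {
    "1B": {"name": "Dart-llm-model-1B", "base_model": "meta-llama/Llama-3.2-1B"},
    "8B": {"name": "Dart-llm-model-8B", "base_model": "meta-llama/Llama-3.1-8B"},
}

model = None
current_model_config = None

def load_backbone(repo_id):
    # Placeholder: app.py loads the base model with BitsAndBytesConfig, then the LoRA adapter.
    return f"<model from {repo_id}>"

def ensure_model(selected_model):
    """Reload only when the dropdown selection actually changes."""
    global model, current_model_config
    if model is not None and current_model_config == selected_model:
        return model  # same selection: reuse the cached model
    if model is not None:
        del model     # switching: release the previous model first
        gc.collect()  # app.py additionally calls torch.cuda.empty_cache()
        model = None
    model = load_backbone(MODEL_CONFIGS[selected_model]["base_model"])
    current_model_config = selected_model
    return model

print(ensure_model("1B"))  # loads 1B
print(ensure_model("1B"))  # cached: no reload
print(ensure_model("8B"))  # clears 1B, then loads 8B

The same guard appears in both `load_model_and_tokenizer` and `load_model_on_gpu` in the diff, which is why `chat_interface` can pass the dropdown value straight through to the GPU functions.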