"""
Robustness Testing for Dynamic Function-Calling Agent

Tests model stability with:
1. Shuffled JSON key order
2. Distractor text before schema
3. Noisy prompts

Quick test that doesn't require retraining.
"""

import json
import random
from test_constrained_model import load_trained_model, constrained_json_generate, create_json_schema
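# For reproducible runs, you can optionally seed the RNG, e.g. random.seed(0).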

def shuffle_json_keys(obj):
    """Recursively shuffle the order of keys in JSON objects."""
    # CPython dicts preserve insertion order (Python 3.7+), so the shuffled
    # order survives json.dumps and the model genuinely sees reordered keys.
    if isinstance(obj, dict):
        items = list(obj.items())
        random.shuffle(items)
        return {k: shuffle_json_keys(v) for k, v in items}
    elif isinstance(obj, list):
        return [shuffle_json_keys(item) for item in obj]
    return obj

def add_distractor_text(schema_str):
    """Prepend a randomly chosen piece of distracting text to the schema."""
    distractors = [
        "Note: This is a complex API with many parameters.",
        "Important: Please review all requirements carefully.",
        "Warning: Some fields may be optional depending on context.",
        "Info: This function supports multiple data formats.",
        "Reminder: Check authentication before making calls."
    ]
    distractor = random.choice(distractors)
    return f"{distractor}\n\n{schema_str}"

def build_prompt(schema_text, query):
    """Assemble the chat-formatted prompt shared by all four test scenarios."""
    return f"""<|im_start|>system
You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<|im_end|>

<schema>
{schema_text}
</schema>

<|im_start|>user
{query}<|im_end|>
<|im_start|>assistant
"""

def test_robustness():
    """Run robustness tests on the function calling agent"""
    print("πŸ§ͺ Starting Robustness Tests...")
    
    # Load model
    model, tokenizer = load_trained_model()
    
    # Test schema
    base_schema = {
        "name": "get_weather_forecast",
        "description": "Get weather forecast for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City name"},
                "days": {"type": "integer", "description": "Number of days", "minimum": 1},
                "units": {"type": "string", "enum": ["metric", "imperial"]},
                "include_hourly": {"type": "boolean", "default": False}
            },
            "required": ["location", "days"]
        }
    }
    
    test_queries = [
        "Get 3-day weather for Paris",
        "Weather forecast for Tokyo, 5 days, metric units",
        "I need the weather for London for the next week"
    ]
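    # Illustrative only: a call satisfying the schema for the first query might
    # look like {"location": "Paris", "days": 3} (the exact output shape
    # depends on what create_json_schema constrains the model to emit).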
    
    results = {
        "baseline": [],
        "shuffled_keys": [],
        "with_distractors": [],
        "both_shuffled_and_distractors": []
    }
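    # Each list collects one pass/fail bool per query.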
    
    print("\nπŸ” Running test scenarios...")
    
    for query in test_queries:
        print(f"\nπŸ“ Query: '{query}'")
        
        # 1. Baseline: original schema, unmodified prompt
        schema = create_json_schema(base_schema)
        prompt = build_prompt(json.dumps(base_schema, indent=2), query)
        response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
        results["baseline"].append(success)
        print(f"  βœ… Baseline: {'βœ“' if success else 'βœ—'}")
        
        # 2. Shuffled keys: same schema content, randomized key order
        shuffled_schema = shuffle_json_keys(base_schema)
        schema = create_json_schema(shuffled_schema)
        prompt = build_prompt(json.dumps(shuffled_schema, indent=2), query)
        response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
        results["shuffled_keys"].append(success)
        print(f"  πŸ”€ Shuffled: {'βœ“' if success else 'βœ—'}")
        
        # 3. Distractor text: irrelevant notice prepended to the schema
        schema = create_json_schema(base_schema)
        schema_with_distractor = add_distractor_text(json.dumps(base_schema, indent=2))
        prompt = build_prompt(schema_with_distractor, query)
        response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
        results["with_distractors"].append(success)
        print(f"  🎭 Distractor: {'βœ“' if success else 'βœ—'}")
        
        # 4. Combined: shuffled keys plus distractor text
        shuffled_schema = shuffle_json_keys(base_schema)
        schema = create_json_schema(shuffled_schema)
        schema_with_distractor = add_distractor_text(json.dumps(shuffled_schema, indent=2))
        prompt = build_prompt(schema_with_distractor, query)
        response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
        results["both_shuffled_and_distractors"].append(success)
        print(f"  πŸ”€πŸŽ­ Both: {'βœ“' if success else 'βœ—'}")
    
    # Calculate success rates
    print("\nπŸ“Š Robustness Test Results:")
    print("=" * 50)
    
    for test_name, test_results in results.items():
        success_rate = (sum(test_results) / len(test_results)) * 100
        print(f"{test_name.replace('_', ' ').title()}: {success_rate:.1f}% ({sum(test_results)}/{len(test_results)})")
    
    print("\n🎯 Analysis:")
    baseline_rate = (sum(results["baseline"]) / len(results["baseline"])) * 100
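    # Thresholds: within 10 points of baseline is 🟒, within 20 is 🟑,
    # a larger drop is πŸ”΄.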
    
    for test_name, test_results in results.items():
        if test_name != "baseline":
            test_rate = (sum(test_results) / len(test_results)) * 100
            diff = test_rate - baseline_rate
            status = "🟒" if diff >= -10 else "🟑" if diff >= -20 else "πŸ”΄"
            print(f"{status} {test_name.replace('_', ' ').title()}: {diff:+.1f}% vs baseline")
    
    return results

if __name__ == "__main__":
    test_robustness()