"""
Robustness Testing for Dynamic Function-Calling Agent

Tests model stability with:
1. Shuffled JSON key order
2. Distractor text before schema
3. Noisy prompts

Quick test that doesn't require retraining.
"""

import json
import random
from test_constrained_model import load_trained_model, constrained_json_generate, create_json_schema
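# For reproducible runs, you can optionally seed the RNG, e.g. random.seed(0).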

def shuffle_json_keys(obj):
    """Recursively shuffle the order of keys in JSON objects."""
    # CPython dicts preserve insertion order (Python 3.7+), so the shuffled
    # order survives json.dumps and the model genuinely sees reordered keys.
    if isinstance(obj, dict):
        items = list(obj.items())
        random.shuffle(items)
        return {k: shuffle_json_keys(v) for k, v in items}
    elif isinstance(obj, list):
        return [shuffle_json_keys(item) for item in obj]
    return obj

def add_distractor_text(schema_str):
    """Prepend a randomly chosen piece of distracting text to the schema."""
    distractors = [
        "Note: This is a complex API with many parameters.",
        "Important: Please review all requirements carefully.",
        "Warning: Some fields may be optional depending on context.",
        "Info: This function supports multiple data formats.",
        "Reminder: Check authentication before making calls."
    ]
    distractor = random.choice(distractors)
    return f"{distractor}\n\n{schema_str}"

def build_prompt(schema_text, query):
    """Assemble the chat-formatted prompt shared by all four test scenarios."""
    return f"""<|im_start|>system
You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<|im_end|>

<schema>
{schema_text}
</schema>

<|im_start|>user
{query}<|im_end|>
<|im_start|>assistant
"""

def test_robustness():
    """Run robustness tests on the function calling agent"""
    print("πŸ§ͺ Starting Robustness Tests...")
    
    # Load model
    model, tokenizer = load_trained_model()
    
    # Test schema
    base_schema = {
        "name": "get_weather_forecast",
        "description": "Get weather forecast for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City name"},
                "days": {"type": "integer", "description": "Number of days", "minimum": 1},
                "units": {"type": "string", "enum": ["metric", "imperial"]},
                "include_hourly": {"type": "boolean", "default": False}
            },
            "required": ["location", "days"]
        }
    }
    
    test_queries = [
        "Get 3-day weather for Paris",
        "Weather forecast for Tokyo, 5 days, metric units",
        "I need the weather for London for the next week"
    ]
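    # Illustrative only: a call satisfying the schema for the first query might
    # look like {"location": "Paris", "days": 3} (the exact output shape
    # depends on what create_json_schema constrains the model to emit).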
    
    results = {
        "baseline": [],
        "shuffled_keys": [],
        "with_distractors": [],
        "both_shuffled_and_distractors": []
    }
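    # Each list collects one pass/fail bool per query.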
    
    print("\nπŸ” Running test scenarios...")
    
    for query in test_queries:
        print(f"\nπŸ“ Query: '{query}'")
        
        # 1. Baseline: original schema, unmodified prompt
        schema = create_json_schema(base_schema)
        prompt = build_prompt(json.dumps(base_schema, indent=2), query)
        response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
        results["baseline"].append(success)
        print(f"  βœ… Baseline: {'βœ“' if success else 'βœ—'}")
        
        # 2. Shuffled keys: same schema content, randomized key order
        shuffled_schema = shuffle_json_keys(base_schema)
        schema = create_json_schema(shuffled_schema)
        prompt = build_prompt(json.dumps(shuffled_schema, indent=2), query)
        response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
        results["shuffled_keys"].append(success)
        print(f"  πŸ”€ Shuffled: {'βœ“' if success else 'βœ—'}")
        
        # 3. Distractor text: irrelevant notice prepended to the schema
        schema = create_json_schema(base_schema)
        schema_with_distractor = add_distractor_text(json.dumps(base_schema, indent=2))
        prompt = build_prompt(schema_with_distractor, query)
        response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
        results["with_distractors"].append(success)
        print(f"  🎭 Distractor: {'βœ“' if success else 'βœ—'}")
        
        # 4. Combined: shuffled keys plus distractor text
        shuffled_schema = shuffle_json_keys(base_schema)
        schema = create_json_schema(shuffled_schema)
        schema_with_distractor = add_distractor_text(json.dumps(shuffled_schema, indent=2))
        prompt = build_prompt(schema_with_distractor, query)
        response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
        results["both_shuffled_and_distractors"].append(success)
        print(f"  πŸ”€πŸŽ­ Both: {'βœ“' if success else 'βœ—'}")
    
    # Calculate success rates
    print("\nπŸ“Š Robustness Test Results:")
    print("=" * 50)
    
    for test_name, test_results in results.items():
        success_rate = (sum(test_results) / len(test_results)) * 100
        print(f"{test_name.replace('_', ' ').title()}: {success_rate:.1f}% ({sum(test_results)}/{len(test_results)})")
    
    print("\n🎯 Analysis:")
    baseline_rate = (sum(results["baseline"]) / len(results["baseline"])) * 100
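    # Thresholds: within 10 points of baseline is 🟒, within 20 is 🟑,
    # a larger drop is πŸ”΄.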
    
    for test_name, test_results in results.items():
        if test_name != "baseline":
            test_rate = (sum(test_results) / len(test_results)) * 100
            diff = test_rate - baseline_rate
            status = "🟒" if diff >= -10 else "🟑" if diff >= -20 else "πŸ”΄"
            print(f"{status} {test_name.replace('_', ' ').title()}: {diff:+.1f}% vs baseline")
    
    return results

if __name__ == "__main__":
    test_robustness()