"""
Robustness Testing for Dynamic Function-Calling Agent
Tests model stability with:
1. Shuffled JSON key order
2. Distractor text before schema
3. Noisy prompts
Quick test that doesn't require retraining.
"""
import json
import random
from test_constrained_model import load_trained_model, constrained_json_generate, create_json_schema
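
# Expected interface of the imported helpers, as used below:
#   load_trained_model() -> (model, tokenizer)
#   create_json_schema(fn_schema: dict) -> schema object for constrained decoding
#   constrained_json_generate(model, tokenizer, prompt, schema) -> (response, success, error)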

def shuffle_json_keys(obj):
    """Recursively shuffle the order of keys in JSON objects."""
    if isinstance(obj, dict):
        items = list(obj.items())
        random.shuffle(items)
        return {k: shuffle_json_keys(v) for k, v in items}
    elif isinstance(obj, list):
        return [shuffle_json_keys(item) for item in obj]
    return obj
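
# Illustrative example (key order is random, so actual output varies):
#   shuffle_json_keys({"name": "f", "parameters": {"units": "C", "days": 3}})
#   might return {"parameters": {"days": 3, "units": "C"}, "name": "f"}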

def add_distractor_text(schema_str):
    """Add distracting text before the schema."""
    distractors = [
        "Note: This is a complex API with many parameters.",
        "Important: Please review all requirements carefully.",
        "Warning: Some fields may be optional depending on context.",
        "Info: This function supports multiple data formats.",
        "Reminder: Check authentication before making calls."
    ]
    distractor = random.choice(distractors)
    return f"{distractor}\n\n{schema_str}"

def test_robustness():
    """Run robustness tests on the function-calling agent."""
    print("🧪 Starting Robustness Tests...")

    # Load model
    model, tokenizer = load_trained_model()

    # Test schema
    base_schema = {
        "name": "get_weather_forecast",
        "description": "Get weather forecast for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City name"},
                "days": {"type": "integer", "description": "Number of days", "minimum": 1},
                "units": {"type": "string", "enum": ["metric", "imperial"]},
                "include_hourly": {"type": "boolean", "default": False}
            },
            "required": ["location", "days"]
        }
    }

    test_queries = [
        "Get 3-day weather for Paris",
        "Weather forecast for Tokyo, 5 days, metric units",
        "I need the weather for London for the next week"
    ]

    results = {
        "baseline": [],
        "shuffled_keys": [],
        "with_distractors": [],
        "both_shuffled_and_distractors": []
    }
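
    # Each list collects one pass/fail boolean per query, so each success
    # rate below is computed over len(test_queries) trials.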
print("\nπ Running test scenarios...")
for query in test_queries:
print(f"\nπ Query: '{query}'")
# 1. Baseline test
schema = create_json_schema(base_schema)
prompt = f"""<|im_start|>system
You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<|im_end|>
<schema>
{json.dumps(base_schema, indent=2)}
</schema>
<|im_start|>user
{query}<|im_end|>
<|im_start|>assistant
"""
response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
results["baseline"].append(success)
print(f" β
Baseline: {'β' if success else 'β'}")

        # 2. Shuffled-keys test
        shuffled_schema = shuffle_json_keys(base_schema)
        schema = create_json_schema(shuffled_schema)
        prompt = build_prompt(json.dumps(shuffled_schema, indent=2), query)
        response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
        results["shuffled_keys"].append(success)
        print(f"  🔀 Shuffled: {'✅' if success else '❌'}")

        # 3. Distractor-text test
        schema = create_json_schema(base_schema)
        schema_with_distractor = add_distractor_text(json.dumps(base_schema, indent=2))
        prompt = build_prompt(schema_with_distractor, query)
        response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
        results["with_distractors"].append(success)
        print(f"  📢 Distractor: {'✅' if success else '❌'}")

        # 4. Shuffled keys and distractor text combined
        shuffled_schema = shuffle_json_keys(base_schema)
        schema = create_json_schema(shuffled_schema)
        schema_with_distractor = add_distractor_text(json.dumps(shuffled_schema, indent=2))
        prompt = build_prompt(schema_with_distractor, query)
        response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
        results["both_shuffled_and_distractors"].append(success)
        print(f"  🔀📢 Both: {'✅' if success else '❌'}")

    # Calculate success rates
    print("\n📊 Robustness Test Results:")
    print("=" * 50)
    for test_name, test_results in results.items():
        success_rate = (sum(test_results) / len(test_results)) * 100
        print(f"{test_name.replace('_', ' ').title()}: {success_rate:.1f}% ({sum(test_results)}/{len(test_results)})")

    # Compare each perturbed scenario against the unperturbed baseline
    print("\n🎯 Analysis:")
    baseline_rate = (sum(results["baseline"]) / len(results["baseline"])) * 100
    for test_name, test_results in results.items():
        if test_name != "baseline":
            test_rate = (sum(test_results) / len(test_results)) * 100
            diff = test_rate - baseline_rate
            # Green: within 10 points of baseline; yellow: within 20; red: worse
            status = "🟢" if diff >= -10 else "🟡" if diff >= -20 else "🔴"
            print(f"{status} {test_name.replace('_', ' ').title()}: {diff:+.1f}% vs baseline")

    return results
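
# Shuffling and distractor selection use Python's global RNG; call
# random.seed(<some int>) beforehand if you need reproducible results.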

if __name__ == "__main__":
    test_robustness()