|
""" |
|
Robustness Testing for Dynamic Function-Calling Agent |
|
|
|
Tests model stability with: |
|
1. Shuffled JSON key order |
|
2. Distractor text before schema |
|
3. Noisy prompts |
|
|
|
Quick test that doesn't require retraining. |
|
""" |
|
|
|
import json |
|
import random |
|
from test_constrained_model import load_trained_model, constrained_json_generate, create_json_schema |
|
|
|
def shuffle_json_keys(obj):
    """Return a copy of *obj* with the key order of every nested dict randomized.

    Lists are rebuilt element by element (their order is kept); any value that
    is neither a dict nor a list is returned unchanged. Randomness comes from
    the module-level ``random`` generator.
    """
    if isinstance(obj, list):
        return [shuffle_json_keys(element) for element in obj]
    if not isinstance(obj, dict):
        return obj

    pairs = list(obj.items())
    random.shuffle(pairs)

    # Dicts preserve insertion order, so inserting in shuffled order is what
    # actually changes the key order seen by json.dumps().
    reordered = {}
    for key, value in pairs:
        reordered[key] = shuffle_json_keys(value)
    return reordered
|
|
|
def add_distractor_text(schema_str):
    """Prefix *schema_str* with one randomly chosen distractor sentence.

    The chosen sentence and the schema text are separated by a blank line.
    """
    preambles = (
        "Note: This is a complex API with many parameters.",
        "Important: Please review all requirements carefully.",
        "Warning: Some fields may be optional depending on context.",
        "Info: This function supports multiple data formats.",
        "Reminder: Check authentication before making calls.",
    )
    chosen = random.choice(preambles)
    return "{}\n\n{}".format(chosen, schema_str)
|
|
|
def _build_prompt(schema_text, query):
    """Render the chat prompt that presents *schema_text* and the user *query*."""
    return f"""<|im_start|>system
You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<|im_end|>

<schema>
{schema_text}
</schema>

<|im_start|>user
{query}<|im_end|>
<|im_start|>assistant
"""


def _success_rate(outcomes):
    """Return the percentage of truthy entries in *outcomes* (0.0 when empty)."""
    if not outcomes:
        return 0.0
    return (sum(outcomes) / len(outcomes)) * 100


def _iter_scenarios(base_schema):
    """Yield (result_key, label, schema_dict, schema_text) for each scenario.

    This is a generator so the random shuffles and distractor picks happen
    lazily, in scenario order, once per query.
    """
    plain_text = json.dumps(base_schema, indent=2)
    yield "baseline", "✅ Baseline", base_schema, plain_text

    shuffled = shuffle_json_keys(base_schema)
    yield "shuffled_keys", "🔀 Shuffled", shuffled, json.dumps(shuffled, indent=2)

    yield (
        "with_distractors",
        "🎭 Distractor",
        base_schema,
        add_distractor_text(plain_text),
    )

    # A fresh shuffle, independent of the "shuffled_keys" scenario above.
    reshuffled = shuffle_json_keys(base_schema)
    yield (
        "both_shuffled_and_distractors",
        "🔀🎭 Both",
        reshuffled,
        add_distractor_text(json.dumps(reshuffled, indent=2)),
    )


def test_robustness():
    """Run robustness tests on the function calling agent.

    For every test query the model is prompted four times: with the baseline
    schema, with shuffled schema keys, with a distractor sentence placed
    before the schema, and with both perturbations combined. The success flag
    returned by ``constrained_json_generate`` is recorded for each run, and a
    summary comparing every perturbed scenario against the baseline is printed.

    Returns:
        dict: scenario name -> list of per-query success flags, in query order.
    """
    print("🧪 Starting Robustness Tests...")

    model, tokenizer = load_trained_model()

    base_schema = {
        "name": "get_weather_forecast",
        "description": "Get weather forecast for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City name"},
                "days": {"type": "integer", "description": "Number of days", "minimum": 1},
                "units": {"type": "string", "enum": ["metric", "imperial"]},
                "include_hourly": {"type": "boolean", "default": False},
            },
            "required": ["location", "days"],
        },
    }

    test_queries = [
        "Get 3-day weather for Paris",
        "Weather forecast for Tokyo, 5 days, metric units",
        "I need the weather for London for the next week",
    ]

    results = {
        "baseline": [],
        "shuffled_keys": [],
        "with_distractors": [],
        "both_shuffled_and_distractors": [],
    }

    print("\n📋 Running test scenarios...")

    for query in test_queries:
        print(f"\n📝 Query: '{query}'")

        for result_key, label, schema_dict, schema_text in _iter_scenarios(base_schema):
            schema = create_json_schema(schema_dict)
            prompt = _build_prompt(schema_text, query)

            _response, success, error = constrained_json_generate(
                model, tokenizer, prompt, schema
            )
            results[result_key].append(success)
            print(f" {label}: {'✓' if success else '✗'}")

            # Surface the reported failure reason instead of discarding it.
            # NOTE(review): the shape of `error` is defined by
            # constrained_json_generate; it is only formatted with str() here.
            if not success and error:
                print(f"    error: {error}")

    print("\n📊 Robustness Test Results:")
    print("=" * 50)

    for test_name, outcomes in results.items():
        title = test_name.replace("_", " ").title()
        print(f"{title}: {_success_rate(outcomes):.1f}% ({sum(outcomes)}/{len(outcomes)})")

    print("\n🎯 Analysis:")
    baseline_rate = _success_rate(results["baseline"])

    for test_name, outcomes in results.items():
        if test_name == "baseline":
            continue
        diff = _success_rate(outcomes) - baseline_rate
        # Green: within 10 points of baseline; yellow: within 20; red: worse.
        if diff >= -10:
            status = "🟢"
        elif diff >= -20:
            status = "🟡"
        else:
            status = "🔴"
        title = test_name.replace("_", " ").title()
        print(f"{status} {title}: {diff:+.1f}% vs baseline")

    return results
|
|
|
# Script entry point: run the robustness suite only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    test_robustness()