""" Robustness Testing for Dynamic Function-Calling Agent Tests model stability with: 1. Shuffled JSON key order 2. Distractor text before schema 3. Noisy prompts Quick test that doesn't require retraining. """ import json import random from test_constrained_model import load_trained_model, constrained_json_generate, create_json_schema def shuffle_json_keys(obj): """Recursively shuffle the order of keys in JSON objects""" if isinstance(obj, dict): items = list(obj.items()) random.shuffle(items) return {k: shuffle_json_keys(v) for k, v in items} elif isinstance(obj, list): return [shuffle_json_keys(item) for item in obj] return obj def add_distractor_text(schema_str): """Add distracting text before the schema""" distractors = [ "Note: This is a complex API with many parameters.", "Important: Please review all requirements carefully.", "Warning: Some fields may be optional depending on context.", "Info: This function supports multiple data formats.", "Reminder: Check authentication before making calls." ] distractor = random.choice(distractors) return f"{distractor}\n\n{schema_str}" def test_robustness(): """Run robustness tests on the function calling agent""" print("๐Ÿงช Starting Robustness Tests...") # Load model model, tokenizer = load_trained_model() # Test schema base_schema = { "name": "get_weather_forecast", "description": "Get weather forecast for a location", "parameters": { "type": "object", "properties": { "location": {"type": "string", "description": "City name"}, "days": {"type": "integer", "description": "Number of days", "minimum": 1}, "units": {"type": "string", "enum": ["metric", "imperial"]}, "include_hourly": {"type": "boolean", "default": False} }, "required": ["location", "days"] } } test_queries = [ "Get 3-day weather for Paris", "Weather forecast for Tokyo, 5 days, metric units", "I need the weather for London for the next week" ] results = { "baseline": [], "shuffled_keys": [], "with_distractors": [], "both_shuffled_and_distractors": [] } print("\n๐Ÿ” Running test scenarios...") for query in test_queries: print(f"\n๐Ÿ“ Query: '{query}'") # 1. Baseline test schema = create_json_schema(base_schema) prompt = f"""<|im_start|>system You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<|im_end|> {json.dumps(base_schema, indent=2)} <|im_start|>user {query}<|im_end|> <|im_start|>assistant """ response, success, error = constrained_json_generate(model, tokenizer, prompt, schema) results["baseline"].append(success) print(f" โœ… Baseline: {'โœ“' if success else 'โœ—'}") # 2. Shuffled keys test shuffled_schema = shuffle_json_keys(base_schema) schema = create_json_schema(shuffled_schema) prompt = f"""<|im_start|>system You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<|im_end|> {json.dumps(shuffled_schema, indent=2)} <|im_start|>user {query}<|im_end|> <|im_start|>assistant """ response, success, error = constrained_json_generate(model, tokenizer, prompt, schema) results["shuffled_keys"].append(success) print(f" ๐Ÿ”€ Shuffled: {'โœ“' if success else 'โœ—'}") # 3. Distractor text test schema = create_json_schema(base_schema) schema_with_distractor = add_distractor_text(json.dumps(base_schema, indent=2)) prompt = f"""<|im_start|>system You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<|im_end|> {schema_with_distractor} <|im_start|>user {query}<|im_end|> <|im_start|>assistant """ response, success, error = constrained_json_generate(model, tokenizer, prompt, schema) results["with_distractors"].append(success) print(f" ๐ŸŽญ Distractor: {'โœ“' if success else 'โœ—'}") # 4. Both shuffled and distractors shuffled_schema = shuffle_json_keys(base_schema) schema = create_json_schema(shuffled_schema) schema_with_distractor = add_distractor_text(json.dumps(shuffled_schema, indent=2)) prompt = f"""<|im_start|>system You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<|im_end|> {schema_with_distractor} <|im_start|>user {query}<|im_end|> <|im_start|>assistant """ response, success, error = constrained_json_generate(model, tokenizer, prompt, schema) results["both_shuffled_and_distractors"].append(success) print(f" ๐Ÿ”€๐ŸŽญ Both: {'โœ“' if success else 'โœ—'}") # Calculate success rates print("\n๐Ÿ“Š Robustness Test Results:") print("=" * 50) for test_name, test_results in results.items(): success_rate = (sum(test_results) / len(test_results)) * 100 print(f"{test_name.replace('_', ' ').title()}: {success_rate:.1f}% ({sum(test_results)}/{len(test_results)})") print("\n๐ŸŽฏ Analysis:") baseline_rate = (sum(results["baseline"]) / len(results["baseline"])) * 100 for test_name, test_results in results.items(): if test_name != "baseline": test_rate = (sum(test_results) / len(test_results)) * 100 diff = test_rate - baseline_rate status = "๐ŸŸข" if diff >= -10 else "๐ŸŸก" if diff >= -20 else "๐Ÿ”ด" print(f"{status} {test_name.replace('_', ' ').title()}: {diff:+.1f}% vs baseline") return results if __name__ == "__main__": test_robustness()