"""
generate_json_syntax_training.py - Ultra-Focused JSON Syntax Training

This script creates training data specifically targeting the "Expecting ',' delimiter"
errors that are the root cause of our 93% failure rate.

Analysis of failures shows the model has issues with:
1. String parameters containing quotes and special characters
2. Proper JSON object structure and comma placement
3. Consistent quote escaping in nested parameters
"""
|
|
|
import json
import random
from typing import Any, Dict, List
|
|
|
def create_training_pair(
    schema: Dict[str, Any],
    question: str,
    good_response: str,
    bad_response: str,
) -> Dict[str, str]:
    """Create a single training pair focused on JSON syntax.

    The prompt is assembled from three parts delimited with ``<|im_start|>`` /
    ``<|im_end|>`` markers: a fixed system instruction, the function schema
    pretty-printed inside ``<schema>`` tags, and the user question. It ends
    with an open assistant turn so the response can be appended directly.

    Args:
        schema: JSON-serializable function schema shown to the model.
        question: User request text.
        good_response: Response stored under ``"chosen"``.
        bad_response: Response stored under ``"rejected"``.

    Returns:
        A dict with the keys ``"prompt"``, ``"chosen"`` and ``"rejected"``.

    Raises:
        TypeError: If ``schema`` is not JSON-serializable (from ``json.dumps``).
    """
    system_message = (
        "You are a helpful assistant that calls functions by responding with "
        "valid JSON when given a schema. Always respond with JSON function "
        "calls only, never prose."
    )

    # Built from explicit newline-terminated pieces rather than a triple-quoted
    # literal so that source indentation or stray trailing characters can
    # never leak into the prompt text.
    prompt = (
        "<|im_start|>system\n"
        f"{system_message}<|im_end|>\n"
        "\n"
        "<schema>\n"
        f"{json.dumps(schema, indent=2)}\n"
        "</schema>\n"
        "\n"
        "<|im_start|>user\n"
        f"{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    return {
        "prompt": prompt,
        "chosen": good_response,
        "rejected": bad_response,
    }
|
|
|
def generate_simple_json_patterns():
    """Build the two warm-up pairs covering the most basic call shapes.

    The first pair contrasts a one-argument JSON call with a prose reply. The
    second contrasts a two-argument JSON call with the same call written
    without the comma between its two arguments.

    Returns:
        list: Two dicts as returned by ``create_training_pair``.
    """
    one_param_schema = {
        "name": "simple_function",
        "description": "Simple function with one parameter",
        "parameters": {
            "type": "object",
            "properties": {
                "text": {"type": "string"},
            },
            "required": ["text"],
        },
    }
    two_param_schema = {
        "name": "two_param_function",
        "description": "Function with two parameters",
        "parameters": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "age": {"type": "integer"},
            },
            "required": ["name", "age"],
        },
    }

    return [
        create_training_pair(
            one_param_schema,
            "Call with hello world",
            '{"name": "simple_function", "arguments": {"text": "hello world"}}',
            "I'll call the function with hello world",
        ),
        create_training_pair(
            two_param_schema,
            "Call with name John and age 25",
            '{"name": "two_param_function", "arguments": {"name": "John", "age": 25}}',
            # Rejected variant: the comma between the two arguments is missing.
            '{"name": "two_param_function", "arguments": {"name": "John" "age": 25}}',
        ),
    ]
|
|
|
def generate_string_escaping_patterns():
    """Generate patterns specifically for string parameter handling.

    Each pair uses a schema whose arguments are all strings and whose chosen
    response carries a longer free-text value (a sentence, an email body, a
    SQL statement). Every rejected response is a prose reply instead of JSON.

    Returns:
        list: Three dicts as returned by ``create_training_pair``.
    """
    examples = []

    # Sentence-length text in a string argument.
    examples.append(create_training_pair(
        {
            "name": "analyze_text",
            "description": "Analyze text content",
            "parameters": {
                "type": "object",
                "properties": {
                    "content": {"type": "string"},
                    "type": {"type": "string"},
                },
                "required": ["content", "type"],
            },
        },
        "Analyze this text: The CEO said we have made tremendous progress this quarter",
        '{"name": "analyze_text", "arguments": {"content": "The CEO said we have made tremendous progress this quarter", "type": "analysis"}}',
        'I will analyze that text for you',
    ))

    # Three string arguments, including an email address. The address lives on
    # the reserved example.com domain; an obfuscation placeholder such as
    # "[email protected]" is not a valid address and must not end up in the
    # training data.
    examples.append(create_training_pair(
        {
            "name": "send_message",
            "description": "Send a message",
            "parameters": {
                "type": "object",
                "properties": {
                    "to": {"type": "string"},
                    "subject": {"type": "string"},
                    "body": {"type": "string"},
                },
                "required": ["to", "subject", "body"],
            },
        },
        "Send email to john@example.com with subject Meeting Update and body The meeting has been rescheduled to tomorrow at 2 PM",
        '{"name": "send_message", "arguments": {"to": "john@example.com", "subject": "Meeting Update", "body": "The meeting has been rescheduled to tomorrow at 2 PM"}}',
        'I will send that email for you',
    ))

    # SQL text containing a comparison operator and a date inside a string.
    examples.append(create_training_pair(
        {
            "name": "process_query",
            "description": "Process database query",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string"},
                    "database": {"type": "string"},
                },
                "required": ["query", "database"],
            },
        },
        "Run query SELECT name FROM users WHERE created_at > 2023-01-01 on the main database",
        '{"name": "process_query", "arguments": {"query": "SELECT name FROM users WHERE created_at > 2023-01-01", "database": "main"}}',
        'I will run that database query for you',
    ))

    return examples
|
|
|
def generate_complex_parameter_patterns():
    """Build pairs whose arguments mix several JSON value types.

    One pair combines an integer, a boolean and a string argument; the other
    combines an array of strings with a string. Both rejected responses are
    prose replies.

    Returns:
        list: Two dicts as returned by ``create_training_pair``.
    """

    def function_schema(name, description, properties, required):
        # Every schema in this function shares the same object-typed envelope.
        return {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        }

    mixed_types_pair = create_training_pair(
        function_schema(
            "configure_system",
            "Configure system settings",
            {
                "timeout": {"type": "integer"},
                "enabled": {"type": "boolean"},
                "level": {"type": "string"},
            },
            ["timeout", "enabled"],
        ),
        "Set timeout to 30 seconds, enable the system, and set level to debug",
        '{"name": "configure_system", "arguments": {"timeout": 30, "enabled": true, "level": "debug"}}',
        'I will configure the system with those settings',
    )

    array_pair = create_training_pair(
        function_schema(
            "process_files",
            "Process multiple files",
            {
                "files": {"type": "array", "items": {"type": "string"}},
                "operation": {"type": "string"},
            },
            ["files", "operation"],
        ),
        "Process files data.csv, results.json, and report.pdf with merge operation",
        '{"name": "process_files", "arguments": {"files": ["data.csv", "results.json", "report.pdf"], "operation": "merge"}}',
        'I will process those files for you',
    )

    return [mixed_types_pair, array_pair]
|
|
|
def generate_exact_failure_patterns():
    """Build pairs for the five schemas this script treats as failure cases.

    Each case is a ``(schema, question, chosen, rejected)`` tuple. The chosen
    response is a JSON call that fills every property of the schema; the
    rejected response is a prose reply.

    Returns:
        list: Five dicts as returned by ``create_training_pair``, in the
        order the cases are listed below.
    """
    cases = (
        (
            {
                "name": "summarize_document",
                "description": "Summarize document content",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "document_url": {"type": "string"},
                        "summary_length": {"type": "string"},
                        "target_audience": {"type": "string"},
                    },
                    "required": ["document_url"],
                },
            },
            "Summarize the document at https://example.com/report.pdf for executives with brief length",
            '{"name": "summarize_document", "arguments": {"document_url": "https://example.com/report.pdf", "summary_length": "brief", "target_audience": "executive"}}',
            'I will summarize that document for executives',
        ),
        (
            {
                "name": "analyze_sentiment",
                "description": "Analyze text sentiment",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "text": {"type": "string"},
                        "language": {"type": "string"},
                        "include_emotions": {"type": "boolean"},
                    },
                    "required": ["text"],
                },
            },
            "Analyze sentiment of this text: The product was excellent and delivery was fast with emotion details in English",
            '{"name": "analyze_sentiment", "arguments": {"text": "The product was excellent and delivery was fast", "language": "en", "include_emotions": true}}',
            'I will analyze the sentiment of that text',
        ),
        (
            {
                "name": "get_weather_forecast",
                "description": "Get weather forecast",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {"type": "string"},
                        "days": {"type": "integer"},
                        "units": {"type": "string"},
                        "include_hourly": {"type": "boolean"},
                    },
                    "required": ["location", "days"],
                },
            },
            "Get 3-day weather forecast for New York in metric units with hourly details",
            '{"name": "get_weather_forecast", "arguments": {"location": "New York", "days": 3, "units": "metric", "include_hourly": true}}',
            'I will get the weather forecast for New York',
        ),
        (
            {
                "name": "convert_currency",
                "description": "Convert currency amounts",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "amount": {"type": "number"},
                        "from_currency": {"type": "string"},
                        "to_currency": {"type": "string"},
                        "include_fees": {"type": "boolean"},
                    },
                    "required": ["amount", "from_currency", "to_currency"],
                },
            },
            "Convert 100 US dollars to Euros with fees included",
            '{"name": "convert_currency", "arguments": {"amount": 100, "from_currency": "USD", "to_currency": "EUR", "include_fees": true}}',
            'I will convert that currency amount for you',
        ),
        (
            {
                "name": "optimize_database_query",
                "description": "Optimize database query",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "sql_query": {"type": "string"},
                        "database_type": {"type": "string"},
                        "performance_target": {"type": "string"},
                    },
                    "required": ["sql_query", "database_type"],
                },
            },
            "Optimize this MySQL query for speed: SELECT id, name FROM users WHERE active = 1",
            '{"name": "optimize_database_query", "arguments": {"sql_query": "SELECT id, name FROM users WHERE active = 1", "database_type": "mysql", "performance_target": "speed"}}',
            'I will optimize that database query for you',
        ),
    )

    return [
        create_training_pair(schema, question, chosen, rejected)
        for schema, question, chosen, rejected in cases
    ]
|
|
|
def main(output_file: str = "tool_pairs_json_syntax.jsonl", repetitions: int = 10) -> int:
    """Generate the ultra-focused JSON syntax training dataset.

    Collects the pairs from the four generator functions once, then appends
    ``repetitions`` further copies of the string, complex and exact-failure
    pairs (the simple pairs are never repeated). The result is written as
    JSON Lines, one pair per line, and a composition summary is printed.

    Args:
        output_file: Path of the JSONL file to write (overwritten if present).
        repetitions: Number of extra copies of the repeated categories.

    Returns:
        The total number of examples written.

    Raises:
        ValueError: If ``repetitions`` is negative.
        OSError: If ``output_file`` cannot be opened for writing.
    """
    if repetitions < 0:
        raise ValueError("repetitions must be non-negative")

    print("🎯 Generating Ultra-Focused JSON Syntax Training...")

    all_examples = []

    print("Adding simple JSON patterns...")
    base_examples = generate_simple_json_patterns()
    all_examples.extend(base_examples)

    print("Adding string escaping patterns...")
    string_examples = generate_string_escaping_patterns()
    all_examples.extend(string_examples)

    print("Adding complex parameter patterns...")
    complex_examples = generate_complex_parameter_patterns()
    all_examples.extend(complex_examples)

    print("Adding exact failure patterns...")
    failure_examples = generate_exact_failure_patterns()
    all_examples.extend(failure_examples)

    # Each pass re-adds the string and complex pairs as well as the failure
    # pairs, so all three categories end up with the same number of copies.
    print(f"Adding {repetitions}x repetitions of exact failure patterns...")
    for _ in range(repetitions):
        all_examples.extend(failure_examples)
        all_examples.extend(string_examples)
        all_examples.extend(complex_examples)

    # Explicit encoding keeps the output independent of the platform default.
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(example) + "\n" for example in all_examples)

    print(f"✅ Generated {len(all_examples)} ultra-focused training examples")
    print(f"💾 Saved to {output_file}")

    # One initial copy plus the repeated ones; derived from a single value so
    # the reported counts cannot drift from the loop above.
    copies = 1 + repetitions
    categories = {
        "Simple JSON patterns": len(base_examples),
        "String escaping patterns": len(string_examples) * copies,
        "Complex parameters": len(complex_examples) * copies,
        "Exact failure patterns": len(failure_examples) * copies,
    }

    print("\nUltra-Focused Training Composition:")
    for category, count in categories.items():
        print(f" {category}: {count} examples")

    print("\n🎯 Ultra-Focused Approach:")
    print(f" • {copies}x repetition of exact failing patterns")
    print(" • Progressive complexity from simple to exact failures")
    print(" • JSON syntax comma and quote handling emphasis")
    print(" • Directly targeting 'Expecting , delimiter' errors")

    return len(all_examples)
|
|
|
# Script entry point: build and write the dataset only when run directly,
# not when this module is imported.
if __name__ == "__main__":
    main()