#!/usr/bin/env python3
"""
replace_reserved_tokens.py

This script updates the reserved special tokens in a Hugging Face tokenizer
directory. It replaces the tokens with the following mapping:

    ID 128013: "<|think|>"
    ID 128014: "<|/think|>"
    ID 128015: "<|answer|>"
    ID 128016: "<|/answer|>"

It updates all key files if they exist: tokenizer_config.json, tokenizer.json,
added_tokens.json, and special_tokens_map.json.

Usage:
    python3 replace_reserved_tokens.py --tokenizer_dir /path/to/tokenizer_dir

A backup (.backup) of each file updated is created.
"""

import argparse
import json
import os
import sys

# Define the replacement mapping as a dictionary with keys as strings
# (these are the token IDs as stored in the JSON) and values as the new token content.
REPLACEMENT_TOKENS = {
    "128013": "<|think|>",
    "128014": "<|/think|>",
    "128015": "<|answer|>",
    "128016": "<|/answer|>"
}


def update_json_file(file_path, updater_func):
    """
    Load a JSON file, update it using updater_func(data), and if changes occur,
    backup the original file and write out the modified JSON.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return False

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"Error reading {file_path}: {e}", file=sys.stderr)
        return False

    changed = updater_func(data)
    if changed:
        backup_path = file_path + ".backup"
        os.rename(file_path, backup_path)
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"Updated '{file_path}'. Backup saved to '{backup_path}'.")
    else:
        print(f"No changes needed for '{file_path}'.")
    return changed


def update_tokenizer_config(data):
    """
    Update the "added_tokens_decoder" field in tokenizer_config.json.
    """
    changed = False
    if "added_tokens_decoder" in data:
        for token_id, new_content in REPLACEMENT_TOKENS.items():
            if token_id in data["added_tokens_decoder"]:
                current = data["added_tokens_decoder"][token_id].get("content", "")
                if current != new_content:
                    print(f"[tokenizer_config.json] Replacing token id {token_id}: '{current}' -> '{new_content}'")
                    data["added_tokens_decoder"][token_id]["content"] = new_content
                    changed = True
                else:
                    print(f"[tokenizer_config.json] Token id {token_id} already set to '{new_content}'.")
            else:
                print(f"[tokenizer_config.json] Warning: token id {token_id} not found.", file=sys.stderr)
    else:
        print("Key 'added_tokens_decoder' not found in tokenizer_config.json.", file=sys.stderr)
    return changed


def update_tokenizer_json(data):
    """
    Update the "added_tokens" list in tokenizer.json. The structure is assumed
    to be a dictionary that includes an "added_tokens" key.
    """
    changed = False
    if "added_tokens" in data and isinstance(data["added_tokens"], list):
        for token in data["added_tokens"]:
            # The token "id" might be an integer or string; compare after converting to string.
            token_id = str(token.get("id"))
            if token_id in REPLACEMENT_TOKENS:
                current = token.get("content", "")
                new_content = REPLACEMENT_TOKENS[token_id]
                if current != new_content:
                    print(f"[tokenizer.json] Replacing token id {token_id}: '{current}' -> '{new_content}'")
                    token["content"] = new_content
                    changed = True
                else:
                    print(f"[tokenizer.json] Token id {token_id} already set to '{new_content}'.")
    else:
        print("Key 'added_tokens' not found or not a list in tokenizer.json.", file=sys.stderr)
    return changed


def update_added_tokens_json(data):
    """
    Update the added_tokens.json file if it exists.
    Assume data is a dict mapping token IDs to token info.
""" changed = False for token_id, new_content in REPLACEMENT_TOKENS.items(): if token_id in data: current = data[token_id].get("content", "") if current != new_content: print(f"[added_tokens.json] Replacing token id {token_id}: '{current}' -> '{new_content}'") data[token_id]["content"] = new_content changed = True else: print(f"[added_tokens.json] Token id {token_id} already set to '{new_content}'.") else: print(f"[added_tokens.json] Warning: token id {token_id} not found.", file=sys.stderr) return changed def update_special_tokens_map(data): """ Update special_tokens_map.json if needed. This file maps roles (e.g. bos_token) to token strings. If any of our replacement tokens appear here, update them. (Often, these reserved tokens are not referenced here, so this function may be a no-op.) """ changed = False for key, value in data.items(): if isinstance(value, str): # If the current value equals one of our original reserved tokens, # you might want to update it. However, without the original values, # we leave it unchanged unless needed. pass return changed def main(): parser = argparse.ArgumentParser( description="Replace reserved tokens in a Hugging Face tokenizer directory." ) parser.add_argument( "--tokenizer_dir", type=str, required=True, help="Directory containing the tokenizer files (e.g., tokenizer_config.json, tokenizer.json, etc.)" ) args = parser.parse_args() tokenizer_dir = args.tokenizer_dir if not os.path.isdir(tokenizer_dir): print(f"Error: Directory '{tokenizer_dir}' not found.", file=sys.stderr) sys.exit(1) # List of (filename, updater_function) files_to_update = [ ("tokenizer_config.json", update_tokenizer_config), ("tokenizer.json", update_tokenizer_json), ("added_tokens.json", update_added_tokens_json), ("special_tokens_map.json", update_special_tokens_map), ] for filename, updater in files_to_update: file_path = os.path.join(tokenizer_dir, filename) if os.path.exists(file_path): print(f"\nProcessing '{filename}'...") update_json_file(file_path, updater) else: print(f"Skipping '{filename}': not found.") if __name__ == "__main__": main()