#!/usr/bin/env python3
"""
replace_reserved_tokens.py

This script updates the reserved special tokens in a Hugging Face tokenizer
directory. It replaces the tokens with the following mapping:

    ID 128013: "<|think|>"
    ID 128014: "<|/think|>"
    ID 128015: "<|answer|>"
    ID 128016: "<|/answer|>"

It updates all key files if they exist: tokenizer_config.json, tokenizer.json,
added_tokens.json, and special_tokens_map.json.

Usage:
    python3 replace_reserved_tokens.py --tokenizer_dir /path/to/tokenizer_dir

A backup (.backup) of each file updated is created.
"""

import argparse
import json
import os
import sys

# Define the replacement mapping as a dictionary with keys as strings
# (these are the token IDs as stored in the JSON) and values as the new token content.
REPLACEMENT_TOKENS = {
    "128013": "<|think|>",
    "128014": "<|/think|>",
    "128015": "<|answer|>",
    "128016": "<|/answer|>"
}


def update_json_file(file_path, updater_func):
    """
    Load a JSON file, update it using updater_func(data), and if changes occur,
    backup the original file and write out the modified JSON.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return False

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"Error reading {file_path}: {e}", file=sys.stderr)
        return False

    changed = updater_func(data)
    if changed:
        backup_path = file_path + ".backup"
        os.rename(file_path, backup_path)
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"Updated '{file_path}'. Backup saved to '{backup_path}'.")
    else:
        print(f"No changes needed for '{file_path}'.")
    return changed


def update_tokenizer_config(data):
    """
    Update the "added_tokens_decoder" field in tokenizer_config.json.
    """
    changed = False
    if "added_tokens_decoder" in data:
        for token_id, new_content in REPLACEMENT_TOKENS.items():
            if token_id in data["added_tokens_decoder"]:
                current = data["added_tokens_decoder"][token_id].get("content", "")
                if current != new_content:
                    print(f"[tokenizer_config.json] Replacing token id {token_id}: '{current}' -> '{new_content}'")
                    data["added_tokens_decoder"][token_id]["content"] = new_content
                    changed = True
                else:
                    print(f"[tokenizer_config.json] Token id {token_id} already set to '{new_content}'.")
            else:
                print(f"[tokenizer_config.json] Warning: token id {token_id} not found.", file=sys.stderr)
    else:
        print("Key 'added_tokens_decoder' not found in tokenizer_config.json.", file=sys.stderr)
    return changed


def update_tokenizer_json(data):
    """
    Update the "added_tokens" list in tokenizer.json. The structure is assumed
    to be a dictionary that includes an "added_tokens" key.
    """
    changed = False
    if "added_tokens" in data and isinstance(data["added_tokens"], list):
        for token in data["added_tokens"]:
            # The token "id" might be an integer or string; compare after converting to string.
            token_id = str(token.get("id"))
            if token_id in REPLACEMENT_TOKENS:
                current = token.get("content", "")
                new_content = REPLACEMENT_TOKENS[token_id]
                if current != new_content:
                    print(f"[tokenizer.json] Replacing token id {token_id}: '{current}' -> '{new_content}'")
                    token["content"] = new_content
                    changed = True
                else:
                    print(f"[tokenizer.json] Token id {token_id} already set to '{new_content}'.")
    else:
        print("Key 'added_tokens' not found or not a list in tokenizer.json.", file=sys.stderr)
    return changed


def update_added_tokens_json(data):
    """
    Update the added_tokens.json file if it exists.
    Assume data is a dict mapping token IDs to token info.
""" changed = False for token_id, new_content in REPLACEMENT_TOKENS.items(): if token_id in data: current = data[token_id].get("content", "") if current != new_content: print(f"[added_tokens.json] Replacing token id {token_id}: '{current}' -> '{new_content}'") data[token_id]["content"] = new_content changed = True else: print(f"[added_tokens.json] Token id {token_id} already set to '{new_content}'.") else: print(f"[added_tokens.json] Warning: token id {token_id} not found.", file=sys.stderr) return changed def update_special_tokens_map(data): """ Update special_tokens_map.json if needed. This file maps roles (e.g. bos_token) to token strings. If any of our replacement tokens appear here, update them. (Often, these reserved tokens are not referenced here, so this function may be a no-op.) """ changed = False for key, value in data.items(): if isinstance(value, str): # If the current value equals one of our original reserved tokens, # you might want to update it. However, without the original values, # we leave it unchanged unless needed. pass return changed def main(): parser = argparse.ArgumentParser( description="Replace reserved tokens in a Hugging Face tokenizer directory." ) parser.add_argument( "--tokenizer_dir", type=str, required=True, help="Directory containing the tokenizer files (e.g., tokenizer_config.json, tokenizer.json, etc.)" ) args = parser.parse_args() tokenizer_dir = args.tokenizer_dir if not os.path.isdir(tokenizer_dir): print(f"Error: Directory '{tokenizer_dir}' not found.", file=sys.stderr) sys.exit(1) # List of (filename, updater_function) files_to_update = [ ("tokenizer_config.json", update_tokenizer_config), ("tokenizer.json", update_tokenizer_json), ("added_tokens.json", update_added_tokens_json), ("special_tokens_map.json", update_special_tokens_map), ] for filename, updater in files_to_update: file_path = os.path.join(tokenizer_dir, filename) if os.path.exists(file_path): print(f"\nProcessing '{filename}'...") update_json_file(file_path, updater) else: print(f"Skipping '{filename}': not found.") if __name__ == "__main__": main()