""" |
|
|
Tokenize Dataset Script: Prepare Tool Calling Dataset for Training |
|
|
|
|
|
This script tokenizes the nvidia/Nemotron-Agentic-v1 tool_calling dataset |
|
|
and uploads it to HuggingFace Hub for reuse. |
|
|
|
|
|
Usage: |
|
|
uv run tokenize_dataset.py |
|
|
|
|
|
Can run on CPU - no GPU required! |
|
|
""" |
|
|
|
|
|
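
# Downstream usage (a minimal sketch; the consuming training script is not part
# of this file). The pushed dataset carries pre-tokenized columns (input_ids,
# attention_mask, labels), so a trainer can load it without re-applying the
# chat template:
#
#   from datasets import load_dataset
#   ds = load_dataset(TOKENIZED_DATASET_REPO)  # needs HF auth while the repo is private
#   ds["train"][0]["input_ids"]                # token ids ready for a causal-LM collator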

import os
import json

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download, HfApi, create_repo
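
# --- Configuration ---------------------------------------------------------
# BASE_MODEL supplies the tokenizer and chat template; the tokenized output is
# only meaningful for models that share this vocabulary. Raw conversations come
# from DATASET_NAME (the DATASET_SPLIT subset), and the tokenized result is
# pushed to TOKENIZED_DATASET_REPO. Conversations longer than MAX_SEQ_LENGTH
# tokens are truncated.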
BASE_MODEL = "Tesslate/Synthia-S1-27b"

DATASET_NAME = "nvidia/Nemotron-Agentic-v1"
DATASET_SPLIT = "tool_calling"

TOKENIZED_DATASET_REPO = "Codyfederer/synthia-tool-calling-tokenized"
TOKENIZED_DATASET_PRIVATE = True

MAX_SEQ_LENGTH = 4096


def tokenize_conversation(example, tokenizer, max_length):
    """
    Tokenize a conversation using the model's chat template.
    Returns input_ids, attention_mask, and labels for causal LM training.
    """
    messages = example["messages"]
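
    # Render the full conversation (including assistant turns) to a single
    # string with the model's chat template. No generation prompt is appended,
    # since the assistant responses are already part of the training targets.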
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )

    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
    )
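
    # Standard causal-LM setup: labels are a copy of input_ids, so the loss is
    # computed over the whole sequence (prompt tokens included).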
tokenized["labels"] = tokenized["input_ids"].copy() |
|
|
|
|
|
return tokenized |
|
|
|
|
|
|
|
|


def main():
    print("=" * 60)
    print("Tokenize Dataset for Tool Calling Training")
    print("=" * 60)
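
    # Fail fast if the user is not authenticated with the Hub; the upload step
    # at the end needs write access.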
    from huggingface_hub import whoami
    try:
        username = whoami()["name"]
        print(f"Logged in as: {username}")
    except Exception as e:
        print(f"ERROR: Not logged in to HF Hub ({e})")
        print("Run 'huggingface-cli login' first")
        return
print(f"\nLoading tokenizer from {BASE_MODEL}...") |
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained( |
|
|
BASE_MODEL, |
|
|
trust_remote_code=True, |
|
|
padding_side="right", |
|
|
) |
|
|
|
|
|
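
    # Some tokenizers ship without a pad token; fall back to EOS so that
    # padding-aware collators work downstream.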
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    print(f"Vocab size: {len(tokenizer):,}")
print(f"\nLoading dataset: {DATASET_NAME} ({DATASET_SPLIT} split)...") |
|
|
|
|
|
|
|
|
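
    # Fetch the raw JSONL file straight from the dataset repo; the records are
    # normalized by hand below before tokenization.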
jsonl_file = f"data/{DATASET_SPLIT}.jsonl" |
|
|
print(f"Downloading {jsonl_file}...") |
|
|
|
|
|
local_path = hf_hub_download( |
|
|
repo_id=DATASET_NAME, |
|
|
filename=jsonl_file, |
|
|
repo_type="dataset" |
|
|
) |
|
|
print(f"Downloaded to: {local_path}") |
|
|
|
|
|
|
|
|
print("Loading and processing JSONL file...") |
|
|
processed_examples = [] |
|
|
skipped = 0 |
|
|
|
|
|
with open(local_path, 'r', encoding='utf-8') as f: |
|
|
for line_num, line in enumerate(f): |
|
|
if line_num % 50000 == 0: |
|
|
print(f" Processed {line_num:,} lines...") |
|
|
try: |
|
|
example = json.loads(line.strip()) |
|
|
messages = example.get("messages", []) |
|
|
|
|
|
|
|
|
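
                # Normalize message content: the source data mixes plain strings,
                # lists of content parts, dicts, and nulls. Flatten everything to
                # a single string so the chat template can render it.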
                formatted_messages = []
                for msg in messages:
                    role = msg.get("role", "user")
                    content = msg.get("content", "")

                    if isinstance(content, list):
                        parts = []
                        for item in content:
                            if isinstance(item, dict):
                                if "text" in item:
                                    parts.append(item["text"])
                                else:
                                    parts.append(json.dumps(item))
                            else:
                                parts.append(str(item))
                        content = "\n".join(parts) if parts else ""
                    elif isinstance(content, dict):
                        content = json.dumps(content)
                    elif content is None:
                        content = ""
                    else:
                        content = str(content)

                    formatted_messages.append({
                        "role": role,
                        "content": content,
                    })
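
                # Many chat templates reject the "tool" role and expect strictly
                # alternating user/assistant turns, so tool results are remapped
                # to user messages and consecutive same-role turns are merged.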
                if formatted_messages:
                    merged_messages = []
                    for msg in formatted_messages:
                        role = msg["role"]
                        content = msg["content"]

                        if role == "tool":
                            role = "user"
                            content = f"[Tool Result]\n{content}"

                        if merged_messages and merged_messages[-1]["role"] == role:
                            merged_messages[-1]["content"] += f"\n\n{content}"
                        else:
                            merged_messages.append({"role": role, "content": content})

                    if merged_messages and merged_messages[0]["role"] != "user":
                        merged_messages.insert(0, {"role": "user", "content": "[Start]"})

                    processed_examples.append({"messages": merged_messages})

            except Exception as e:
                skipped += 1
                if skipped < 5:
                    print(f" Warning: Skipped line {line_num}: {e}")

    print(f"Loaded {len(processed_examples):,} examples (skipped {skipped})")

    dataset = Dataset.from_list(processed_examples)
    print(f"Dataset size: {len(dataset):,} examples")
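
    # Hold out a small evaluation split (2%) with a fixed seed so the split is
    # reproducible across runs.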
    split_dataset = dataset.train_test_split(test_size=0.02, seed=42)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]

    print(f"Train samples: {len(train_dataset):,}")
    print(f"Eval samples: {len(eval_dataset):,}")
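
    # Apply the chat template and tokenize every conversation. The original
    # "messages" column is dropped so only token-level columns are uploaded.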
print(f"\nTokenizing dataset with max_length={MAX_SEQ_LENGTH}...") |
|
|
print("This may take a while for large datasets...") |
|
|
|
|
|
train_dataset = train_dataset.map( |
|
|
lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH), |
|
|
remove_columns=["messages"], |
|
|
num_proc=1, |
|
|
desc="Tokenizing train", |
|
|
) |
|
|
|
|
|
eval_dataset = eval_dataset.map( |
|
|
lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH), |
|
|
remove_columns=["messages"], |
|
|
num_proc=1, |
|
|
desc="Tokenizing eval", |
|
|
) |
|
|
|
|
|
print(f"Tokenization complete!") |
|
|
print(f"Train dataset columns: {train_dataset.column_names}") |
|
|
print(f"Sample input_ids length: {len(train_dataset[0]['input_ids'])}") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
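
    # Create (or reuse) the target dataset repo and push both splits. The repo
    # is private by default so the tokenized data is not publicly exposed.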
print(f"\nUploading TOKENIZED dataset to Hub: {TOKENIZED_DATASET_REPO}") |
|
|
|
|
|
|
|
|
api = HfApi() |
|
|
try: |
|
|
create_repo( |
|
|
TOKENIZED_DATASET_REPO, |
|
|
repo_type="dataset", |
|
|
private=TOKENIZED_DATASET_PRIVATE, |
|
|
exist_ok=True |
|
|
) |
|
|
print(f" Created/verified repo (private={TOKENIZED_DATASET_PRIVATE})") |
|
|
|
|
|
if TOKENIZED_DATASET_PRIVATE: |
|
|
try: |
|
|
api.update_repo_visibility( |
|
|
TOKENIZED_DATASET_REPO, |
|
|
repo_type="dataset", |
|
|
private=True |
|
|
) |
|
|
except Exception: |
|
|
pass |
|
|
except Exception as e: |
|
|
print(f" Repo note: {e}") |
|
|
|
|
|
|
|
|
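
    # Defensive: clear any active output format so push_to_hub serializes the
    # plain Python columns (no format transform is set earlier in this script).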
    train_dataset.reset_format()
    eval_dataset.reset_format()

    print(" Verifying tokenized data...")
    print(f" Train columns: {train_dataset.column_names}")
    print(f" Sample input_ids type: {type(train_dataset[0]['input_ids'])}")
    print(f" Sample input_ids length: {len(train_dataset[0]['input_ids'])}")
    print(f" First 10 tokens: {train_dataset[0]['input_ids'][:10]}")
print(f" Pushing train split ({len(train_dataset):,} examples)...") |
|
|
train_dataset.push_to_hub( |
|
|
TOKENIZED_DATASET_REPO, |
|
|
split="train", |
|
|
) |
|
|
|
|
|
print(f" Pushing test split ({len(eval_dataset):,} examples)...") |
|
|
eval_dataset.push_to_hub( |
|
|
TOKENIZED_DATASET_REPO, |
|
|
split="test", |
|
|
) |
|
|
|
|
|
print(f"\n" + "=" * 60) |
|
|
print(f"SUCCESS! Tokenized dataset saved to:") |
|
|
print(f" https://huggingface.co/datasets/{TOKENIZED_DATASET_REPO}") |
|
|
print(f"=" * 60) |
|
|
|
|
|
|
|
|
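
    # Stream a single sample back from the Hub to confirm the uploaded dataset
    # actually contains the tokenized columns.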
print("\nVerifying upload...") |
|
|
try: |
|
|
from datasets import load_dataset as verify_load |
|
|
verify_ds = verify_load(TOKENIZED_DATASET_REPO, split="train", streaming=True) |
|
|
sample = next(iter(verify_ds)) |
|
|
if "input_ids" in sample: |
|
|
print(f" VERIFIED: Dataset contains input_ids with {len(sample['input_ids'])} tokens") |
|
|
else: |
|
|
print(f" WARNING: input_ids not found in columns: {list(sample.keys())}") |
|
|
except Exception as ve: |
|
|
print(f" Could not verify: {ve}") |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |
|
|
|