# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "transformers>=4.50.0",
#     "datasets>=2.14.0",
#     "huggingface_hub",
# ]
# ///
"""
Tokenize Dataset Script: Prepare Tool Calling Dataset for Training

This script tokenizes the nvidia/Nemotron-Agentic-v1 tool_calling dataset
and uploads it to HuggingFace Hub for reuse.

Usage:
    uv run tokenize_dataset.py

Can run on CPU - no GPU required!
"""

import json

from datasets import Dataset, load_dataset
from huggingface_hub import HfApi, create_repo, hf_hub_download
from transformers import AutoTokenizer

# ============================================================================
# CONFIGURATION
# ============================================================================

# Model to get tokenizer from
BASE_MODEL = "Tesslate/Synthia-S1-27b"

# Source dataset
DATASET_NAME = "nvidia/Nemotron-Agentic-v1"
DATASET_SPLIT = "tool_calling"

# Output tokenized dataset
TOKENIZED_DATASET_REPO = "Codyfederer/synthia-tool-calling-tokenized"
TOKENIZED_DATASET_PRIVATE = True

# Tokenization settings
MAX_SEQ_LENGTH = 4096

# ============================================================================
# TOKENIZATION FUNCTIONS
# ============================================================================

def tokenize_conversation(example, tokenizer, max_length):
    """
    Tokenize a conversation using the model's chat template.

    Returns input_ids, attention_mask, and labels for causal LM training.
    """
    messages = example["messages"]

    # Apply chat template to get the full text
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )

    # Tokenize the text
    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
    )

    # For causal LM, labels are the same as input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()

    return tokenized


def main():
    print("=" * 60)
    print("Tokenize Dataset for Tool Calling Training")
    print("=" * 60)

    # Get HF username
    from huggingface_hub import whoami

    try:
        username = whoami()["name"]
        print(f"Logged in as: {username}")
    except Exception as e:
        print(f"ERROR: Not logged in to HF Hub ({e})")
        print("Run 'huggingface-cli login' first")
        return

    # -------------------------------------------------------------------------
    # Load Tokenizer
    # -------------------------------------------------------------------------
    print(f"\nLoading tokenizer from {BASE_MODEL}...")
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL,
        trust_remote_code=True,
        padding_side="right",
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
    print(f"Vocab size: {len(tokenizer):,}")

    # -------------------------------------------------------------------------
    # Load Source Dataset
    # -------------------------------------------------------------------------
    print(f"\nLoading dataset: {DATASET_NAME} ({DATASET_SPLIT} split)...")

    # Download the JSONL file
    jsonl_file = f"data/{DATASET_SPLIT}.jsonl"
    print(f"Downloading {jsonl_file}...")
    local_path = hf_hub_download(
        repo_id=DATASET_NAME,
        filename=jsonl_file,
        repo_type="dataset",
    )
    print(f"Downloaded to: {local_path}")

    # Load and process JSONL
    print("Loading and processing JSONL file...")
    processed_examples = []
    skipped = 0

    with open(local_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f):
            if line_num % 50000 == 0:
                print(f"  Processed {line_num:,} lines...")

            try:
                example = json.loads(line.strip())
                messages = example.get("messages", [])

                # Convert messages to consistent format
                formatted_messages = []
                for msg in messages:
                    role = msg.get("role", "user")
                    content = msg.get("content", "")

                    # Handle content that might be a list or complex object
                    if isinstance(content, list):
                        parts = []
                        for item in content:
                            if isinstance(item, dict):
                                if "text" in item:
                                    parts.append(item["text"])
                                else:
                                    parts.append(json.dumps(item))
                            else:
                                parts.append(str(item))
                        content = "\n".join(parts) if parts else ""
                    elif isinstance(content, dict):
                        content = json.dumps(content)
                    elif content is None:
                        content = ""
                    else:
                        content = str(content)

                    formatted_messages.append({"role": role, "content": content})

                # Merge consecutive messages with same role
                if formatted_messages:
                    merged_messages = []
                    for msg in formatted_messages:
                        role = msg["role"]
                        content = msg["content"]

                        # Map tool role to user
                        if role == "tool":
                            role = "user"
                            content = f"[Tool Result]\n{content}"

                        if merged_messages and merged_messages[-1]["role"] == role:
                            merged_messages[-1]["content"] += f"\n\n{content}"
                        else:
                            merged_messages.append({"role": role, "content": content})

                    # Ensure conversation starts with user
                    if merged_messages and merged_messages[0]["role"] != "user":
                        merged_messages.insert(0, {"role": "user", "content": "[Start]"})

                    processed_examples.append({"messages": merged_messages})

            except Exception as e:
                skipped += 1
                if skipped < 5:
                    print(f"  Warning: Skipped line {line_num}: {e}")

    print(f"Loaded {len(processed_examples):,} examples (skipped {skipped})")

    # Create dataset
    dataset = Dataset.from_list(processed_examples)
    print(f"Dataset size: {len(dataset):,} examples")

    # Create train/eval split
    split_dataset = dataset.train_test_split(test_size=0.02, seed=42)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]
    print(f"Train samples: {len(train_dataset):,}")
    print(f"Eval samples: {len(eval_dataset):,}")

    # -------------------------------------------------------------------------
    # Tokenize Dataset
    # -------------------------------------------------------------------------
    print(f"\nTokenizing dataset with max_length={MAX_SEQ_LENGTH}...")
    print("This may take a while for large datasets...")

    train_dataset = train_dataset.map(
        lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH),
        remove_columns=["messages"],
        num_proc=1,  # Use single process to reduce memory
        desc="Tokenizing train",
    )
    eval_dataset = eval_dataset.map(
        lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH),
        remove_columns=["messages"],
        num_proc=1,  # Use single process to reduce memory
        desc="Tokenizing eval",
    )

    print("Tokenization complete!")
    print(f"Train dataset columns: {train_dataset.column_names}")
    print(f"Sample input_ids length: {len(train_dataset[0]['input_ids'])}")

    # -------------------------------------------------------------------------
    # Upload to Hub
    # -------------------------------------------------------------------------
    print(f"\nUploading TOKENIZED dataset to Hub: {TOKENIZED_DATASET_REPO}")

    # Create repo
    api = HfApi()
    try:
        create_repo(
            TOKENIZED_DATASET_REPO,
            repo_type="dataset",
            private=TOKENIZED_DATASET_PRIVATE,
            exist_ok=True,
        )
        print(f"  Created/verified repo (private={TOKENIZED_DATASET_PRIVATE})")
        if TOKENIZED_DATASET_PRIVATE:
            try:
                api.update_repo_visibility(
                    TOKENIZED_DATASET_REPO,
                    repo_type="dataset",
                    private=True,
                )
            except Exception:
                pass
    except Exception as e:
        print(f"  Repo note: {e}")

    # Reset format for serialization
    train_dataset.reset_format()
    eval_dataset.reset_format()

    # Verify data
    print("  Verifying tokenized data...")
    print(f"  Train columns: {train_dataset.column_names}")
    print(f"  Sample input_ids type: {type(train_dataset[0]['input_ids'])}")
    print(f"  Sample input_ids length: {len(train_dataset[0]['input_ids'])}")
    print(f"  First 10 tokens: {train_dataset[0]['input_ids'][:10]}")

    # Push to Hub
    print(f"  Pushing train split ({len(train_dataset):,} examples)...")
    train_dataset.push_to_hub(
        TOKENIZED_DATASET_REPO,
        split="train",
    )
    print(f"  Pushing test split ({len(eval_dataset):,} examples)...")
    eval_dataset.push_to_hub(
        TOKENIZED_DATASET_REPO,
        split="test",
    )

    print("\n" + "=" * 60)
    print("SUCCESS! Tokenized dataset saved to:")
    print(f"  https://huggingface.co/datasets/{TOKENIZED_DATASET_REPO}")
    print("=" * 60)

    # Verify upload
    print("\nVerifying upload...")
    try:
        verify_ds = load_dataset(TOKENIZED_DATASET_REPO, split="train", streaming=True)
        sample = next(iter(verify_ds))
        if "input_ids" in sample:
            print(f"  VERIFIED: Dataset contains input_ids with {len(sample['input_ids'])} tokens")
        else:
            print(f"  WARNING: input_ids not found in columns: {list(sample.keys())}")
    except Exception as ve:
        print(f"  Could not verify: {ve}")


if __name__ == "__main__":
    main()
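# -----------------------------------------------------------------------------
# Example (not executed): reusing the tokenized dataset in a training script.
# This is a minimal sketch only; the DataCollatorForSeq2Seq choice and the
# surrounding Trainer wiring are assumptions, not something this script sets up.
#
#     from datasets import load_dataset
#     from transformers import AutoTokenizer, DataCollatorForSeq2Seq
#
#     tokenizer = AutoTokenizer.from_pretrained("Tesslate/Synthia-S1-27b")
#     train_ds = load_dataset("Codyfederer/synthia-tool-calling-tokenized", split="train")
#     eval_ds = load_dataset("Codyfederer/synthia-tool-calling-tokenized", split="test")
#
#     # Each split already carries input_ids / attention_mask / labels, so the
#     # collator only needs to pad each batch; label_pad_token_id=-100 keeps the
#     # padded label positions out of the loss.
#     collator = DataCollatorForSeq2Seq(tokenizer, padding=True, label_pad_token_id=-100)
# -----------------------------------------------------------------------------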