Codyfederer committed
Commit fe42dd1 · verified · 1 Parent(s): 5d5708e

Upload train_h100.py with huggingface_hub

Files changed (1)
  1. train_h100.py +36 -6
train_h100.py CHANGED
@@ -17,11 +17,12 @@ Optimized for H100 80GB
 """
 
 import os
+from dataclasses import dataclass
+from typing import Any, Dict, List
 from datasets import load_dataset
 from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
-    DataCollatorForLanguageModeling,
     Trainer,
     TrainingArguments,
 )
@@ -30,6 +31,38 @@ import torch
 import trackio
 from huggingface_hub import whoami
 
+
+@dataclass
+class DataCollatorForPreTokenized:
+    """Data collator for pre-tokenized datasets with padding."""
+    pad_token_id: int
+
+    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
+        # Find max length in batch
+        max_length = max(len(f["input_ids"]) for f in features)
+
+        batch = {
+            "input_ids": [],
+            "attention_mask": [],
+            "labels": [],
+        }
+
+        for feature in features:
+            input_ids = feature["input_ids"]
+            attention_mask = feature["attention_mask"]
+            labels = feature.get("labels", input_ids.copy())
+
+            # Calculate padding needed
+            padding_length = max_length - len(input_ids)
+
+            # Pad sequences (right padding)
+            batch["input_ids"].append(input_ids + [self.pad_token_id] * padding_length)
+            batch["attention_mask"].append(attention_mask + [0] * padding_length)
+            batch["labels"].append(labels + [-100] * padding_length)  # -100 is ignored by loss
+
+        # Convert to tensors
+        return {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}
+
 # Configuration
 BASE_MODEL = "Tesslate/Synthia-S1-27b"
 OUTPUT_MODEL = "Synthia-S1-27b-tool-calling"
@@ -148,7 +181,7 @@ training_args = TrainingArguments(
     report_to="trackio",
     run_name=f"synthia-tool-calling-lora-r{LORA_R}",
     bf16=True,
-    dataloader_num_workers=4,
+    dataloader_num_workers=0,  # Avoid multiprocessing issues with custom collator
     dataloader_pin_memory=True,
     seed=42,
     remove_unused_columns=False,
@@ -156,10 +189,7 @@ training_args = TrainingArguments(
 
 # Initialize trainer
 print("\nInitializing trainer...")
-data_collator = DataCollatorForLanguageModeling(
-    tokenizer=tokenizer,
-    mlm=False,
-)
+data_collator = DataCollatorForPreTokenized(pad_token_id=tokenizer.pad_token_id)
 
 trainer = Trainer(
     model=model,
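
For reference, a minimal sketch of what the new collator produces on a toy batch. The feature values and pad_token_id=0 below are made up for illustration; the script itself constructs the collator with tokenizer.pad_token_id, as in the last hunk above.

# Illustrative sketch only: exercises DataCollatorForPreTokenized (defined in
# train_h100.py above) on two pre-tokenized examples of different lengths.
# pad_token_id=0 is an assumption for this toy example; the real script uses
# tokenizer.pad_token_id.
collator = DataCollatorForPreTokenized(pad_token_id=0)

features = [
    {"input_ids": [5, 6, 7, 8], "attention_mask": [1, 1, 1, 1]},          # length 4
    {"input_ids": [9, 10], "attention_mask": [1, 1], "labels": [9, 10]},  # length 2, padded to 4
]

batch = collator(features)
# input_ids are right-padded with pad_token_id, attention_mask with 0,
# and labels with -100 so padded positions are ignored by the loss:
# batch["input_ids"]      -> tensor([[5, 6, 7, 8], [9, 10, 0, 0]])
# batch["attention_mask"] -> tensor([[1, 1, 1, 1], [1, 1, 0, 0]])
# batch["labels"]         -> tensor([[5, 6, 7, 8], [9, 10, -100, -100]])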