sudoping01 committed
Commit 012c5a3 · verified · 1 Parent(s): 4343e11

Update README.md

Files changed (1)
  1. README.md +8 -8
README.md CHANGED
@@ -28,7 +28,7 @@ hub_model_id: sudoping01/bambara-llm-exp3
 plugins:
   - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
 cut_cross_entropy: true
-load_in_4bit: false # Changed: Use LoRA instead of QLoRA for better quality
+load_in_4bit: false
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
@@ -49,26 +49,26 @@ datasets:
   content: content
 val_set_size: 0.01
 output_dir: ./outputs/bambara-gemma3n-lora-exp4
-adapter: lora # Changed: LoRA instead of QLoRA
-lora_r: 64 # Increased: Higher rank for better capacity
-lora_alpha: 128 # Increased: 2x the rank is a good starting point
+adapter: lora
+lora_r: 64
+lora_alpha: 128
 lora_dropout: 0.05
 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
 sequence_len: 4096
 sample_packing: false
 pad_to_sequence_len: false
-micro_batch_size: 8 # Increased: You have 8x H100s, can handle larger batches
+micro_batch_size: 8
 gradient_accumulation_steps: 2
-num_epochs: 3 # Reduced: Start conservative with 1M samples
+num_epochs: 3
 optimizer: adamw_8bit
 lr_scheduler: cosine
-learning_rate: 1.2e-4 # Changed: Your friend's suggestion for 1M samples on 7B model
+learning_rate: 1.2e-4
 warmup_ratio: 0.03
 weight_decay: 0.01
 bf16: auto
 tf32: false
 logging_steps: 10
-saves_per_epoch: 2 # Increased: More checkpoints for 1M samples
+saves_per_epoch: 2
 evals_per_epoch: 2
 ```
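
This commit only strips the inline rationale comments from the YAML; the values themselves (adapter: lora, lora_r: 64, lora_alpha: 128, micro_batch_size: 8, num_epochs: 3, learning_rate: 1.2e-4, saves_per_epoch: 2) are unchanged. For context, with micro_batch_size: 8, gradient_accumulation_steps: 2, and the 8 GPUs referenced in the removed comment, the effective global batch size works out to 8 × 2 × 8 = 128 sequences per optimizer step. Below is a minimal sketch of how the resulting adapter might be loaded for inference. The base checkpoint ID, dtype, and prompt are illustrative assumptions; only hub_model_id: sudoping01/bambara-llm-exp3 comes from the config.

```python
# Minimal inference sketch, assuming the adapter was pushed to the hub_model_id
# above and that the base is a Gemma 3n text checkpoint loadable with
# AutoModelForCausalLM (both are assumptions, not stated in this README).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "google/gemma-3n-E4B-it"       # assumed base checkpoint
ADAPTER_ID = "sudoping01/bambara-llm-exp3"  # hub_model_id from the config

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto"
)

# Attach the LoRA weights (lora_r: 64, lora_alpha: 128 in the config) to the frozen base.
model = PeftModel.from_pretrained(base, ADAPTER_ID)

prompt = "I ni ce"  # placeholder Bambara greeting
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

Training against this YAML would typically be launched through the Axolotl CLI, e.g. `accelerate launch -m axolotl.cli.train config.yml`; the exact entry point depends on the installed Axolotl version.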