sudoping01 committed
Commit 012c5a3 · verified · 1 Parent(s): 4343e11

Update README.md

Files changed (1)
  1. README.md +8 -8
README.md CHANGED
@@ -28,7 +28,7 @@ hub_model_id: sudoping01/bambara-llm-exp3
 plugins:
   - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
 cut_cross_entropy: true
-load_in_4bit: false # Changed: Use LoRA instead of QLoRA for better quality
+load_in_4bit: false
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
@@ -49,26 +49,26 @@ datasets:
   content: content
 val_set_size: 0.01
 output_dir: ./outputs/bambara-gemma3n-lora-exp4
-adapter: lora # Changed: LoRA instead of QLoRA
-lora_r: 64 # Increased: Higher rank for better capacity
-lora_alpha: 128 # Increased: 2x the rank is a good starting point
+adapter: lora
+lora_r: 64
+lora_alpha: 128
 lora_dropout: 0.05
 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
 sequence_len: 4096
 sample_packing: false
 pad_to_sequence_len: false
-micro_batch_size: 8 # Increased: You have 8x H100s, can handle larger batches
+micro_batch_size: 8
 gradient_accumulation_steps: 2
-num_epochs: 3 # Reduced: Start conservative with 1M samples
+num_epochs: 3
 optimizer: adamw_8bit
 lr_scheduler: cosine
-learning_rate: 1.2e-4 # Changed: Your friend's suggestion for 1M samples on 7B model
+learning_rate: 1.2e-4
 warmup_ratio: 0.03
 weight_decay: 0.01
 bf16: auto
 tf32: false
 logging_steps: 10
-saves_per_epoch: 2 # Increased: More checkpoints for 1M samples
+saves_per_epoch: 2
 evals_per_epoch: 2
 ```
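
This commit only strips the inline rationale comments from the YAML; the values themselves (adapter: lora, lora_r: 64, lora_alpha: 128, micro_batch_size: 8, num_epochs: 3, learning_rate: 1.2e-4, saves_per_epoch: 2) are unchanged. For context, with micro_batch_size: 8, gradient_accumulation_steps: 2, and the 8 GPUs referenced in the removed comment, the effective global batch size works out to 8 × 2 × 8 = 128 sequences per optimizer step. Below is a minimal sketch of how the resulting adapter might be loaded for inference. The base checkpoint ID, dtype, and prompt are illustrative assumptions; only hub_model_id: sudoping01/bambara-llm-exp3 comes from the config.

```python
# Minimal inference sketch, assuming the adapter was pushed to the hub_model_id
# above and that the base is a Gemma 3n text checkpoint loadable with
# AutoModelForCausalLM (both are assumptions, not stated in this README).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "google/gemma-3n-E4B-it"       # assumed base checkpoint
ADAPTER_ID = "sudoping01/bambara-llm-exp3"  # hub_model_id from the config

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto"
)

# Attach the LoRA weights (lora_r: 64, lora_alpha: 128 in the config) to the frozen base.
model = PeftModel.from_pretrained(base, ADAPTER_ID)

prompt = "I ni ce"  # placeholder Bambara greeting
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

Training against this YAML would typically be launched through the Axolotl CLI, e.g. `accelerate launch -m axolotl.cli.train config.yml`; the exact entry point depends on the installed Axolotl version.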