File size: 4,954 Bytes
7437c6c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
---
data:
  data_path: ../data/msmarco/parsed_hard_ids_10p_train.jsonl
  dataset_seed: 42
  max_block_length: 160
  max_seq_length: 6144
  num_documents: 30
  qrels_path: null
  streaming: false
  train_test_split: 0.99
  val_data_path: null
model:
  attn_implementation: default_blockrank
  lora_alpha: -1
  lora_dropout: 0.0
  lora_r: -1
  lora_target_modules: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
  model_name_or_path: /workspace/nilesh_work/hf_cache/Mistral-7B-Instruct-v0.3
  trust_remote_code: false
  use_4bit: false
  use_blockrank: true
  use_lora: false
training:
  accelerator_config:
    dispatch_batches: null
    even_batches: true
    gradient_accumulation_kwargs: null
    non_blocking: false
    split_batches: false
    use_seedable_sampler: true
  activation_offloading: false
  adafactor: false
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  assistant_only_loss: false
  auto_find_batch_size: false
  aux_layer_idx: 20
  aux_loss_weight: 0.1
  aux_temperature: 0.05
  average_tokens_across_devices: true
  batch_eval_metrics: false
  bf16: true
  bf16_full_eval: false
  chat_template_path: null
  completion_only_loss: null
  data_seed: null
  dataloader_drop_last: false
  dataloader_num_workers: 0
  dataloader_persistent_workers: false
  dataloader_pin_memory: true
  dataloader_prefetch_factor: null
  dataset_kwargs:
    skip_prepare_dataset: true
  dataset_num_proc: null
  dataset_text_field: text
  ddp_backend: null
  ddp_broadcast_buffers: null
  ddp_bucket_cap_mb: null
  ddp_find_unused_parameters: null
  ddp_timeout: 1800
  debug: []
  deepspeed: null
  disable_tqdm: false
  do_eval: true
  do_predict: false
  do_train: false
  eos_token: '<EOS_TOKEN>'
  eval_accumulation_steps: null
  eval_delay: 0
  eval_do_concat_batches: true
  eval_on_start: false
  eval_packing: null
  eval_steps: 500
  # NOTE(review): `eval_strategy: 'no'` conflicts with the deprecated alias
  # `evaluation_strategy: steps` below — confirm which setting the trainer
  # actually honors before relying on step-based evaluation.
  eval_strategy: 'no'
  eval_use_gather_object: false
  evaluation_strategy: steps
  fp16: false
  fp16_backend: auto
  fp16_full_eval: false
  fp16_opt_level: O1
  fsdp: []
  fsdp_config:
    min_num_params: 0
    xla: false
    xla_fsdp_grad_ckpt: false
    xla_fsdp_v2: false
  fsdp_min_num_params: 0
  fsdp_transformer_layer_cls_to_wrap: null
  full_determinism: false
  gradient_accumulation_steps: 4
  gradient_checkpointing: true
  gradient_checkpointing_kwargs: null
  greater_is_better: false
  group_by_length: false
  half_precision_backend: auto
  hub_always_push: false
  hub_model_id: null
  hub_private_repo: null
  hub_revision: null
  hub_strategy: every_save
  hub_token: '<HUB_TOKEN>'
  ignore_data_skip: false
  include_for_metrics: []
  include_inputs_for_metrics: false
  include_num_input_tokens_seen: 'no'
  include_tokens_per_second: false
  jit_mode_eval: false
  label_names: null
  label_smoothing_factor: 0.0
  learning_rate: 3.0e-06
  length_column_name: length
  liger_kernel_config: null
  load_best_model_at_end: false
  local_rank: 0
  log_level: passive
  log_level_replica: warning
  log_on_each_node: true
  logging_dir: ../outputs/blockrank-with-aux-loss-mistral-7b-icr-medium_hard_ids-full/runs/Nov03_04-06-43_06353250b0cb
  logging_first_step: true
  logging_nan_inf_filter: true
  logging_steps: 25
  logging_strategy: steps
  loss_type: nll
  lr_scheduler_kwargs: {}
  lr_scheduler_type: cosine
  max_grad_norm: 1.0
  max_length: 1024
  max_steps: -1
  metric_for_best_model: eval_loss
  model_init_kwargs: null
  mp_parameters: ''
  neftune_noise_alpha: null
  no_cuda: false
  num_train_epochs: 1
  optim: adamw_8bit
  optim_args: null
  optim_target_modules: null
  output_dir: ../outputs/blockrank-with-aux-loss-mistral-7b-icr-medium_hard_ids-full
  overwrite_output_dir: false
  packing: false
  packing_strategy: bfd
  pad_to_multiple_of: null
  pad_token: '<PAD_TOKEN>'
  padding_free: false
  parallelism_config: null
  past_index: -1
  per_device_eval_batch_size: 1
  per_device_train_batch_size: 1
  per_gpu_eval_batch_size: null
  per_gpu_train_batch_size: null
  prediction_loss_only: false
  project: huggingface
  push_to_hub: false
  push_to_hub_model_id: null
  push_to_hub_organization: null
  push_to_hub_token: '<PUSH_TO_HUB_TOKEN>'
  ray_scope: last
  remove_unused_columns: false
  report_to:
  - wandb
  restore_callback_states_from_checkpoint: false
  resume_from_checkpoint: null
  run_name: blockrank-with-aux-loss-mistral-7b-icr-medium_hard_ids-full
  save_on_each_node: false
  save_only_model: false
  save_safetensors: true
  save_steps: 500
  save_strategy: 'no'
  save_total_limit: 1
  seed: 42
  skip_memory_metrics: true
  tf32: null
  torch_compile: false
  torch_compile_backend: null
  torch_compile_mode: null
  torch_empty_cache_steps: null
  torchdynamo: null
  tpu_metrics_debug: false
  tpu_num_cores: null
  trackio_space_id: trackio
  use_aux_loss: true
  use_cpu: false
  use_legacy_prediction_loop: false
  use_liger_kernel: false
  use_mps_device: false
  warmup_ratio: 0.01
  warmup_steps: 0
  weight_decay: 0.0