| _wandb: | |
| value: | |
| cli_version: 0.20.1 | |
| m: | |
| - "1": gpu/memory_reserved_gb | |
| "6": | |
| - 3 | |
| "7": [] | |
| - "1": gpu/max_memory_allocated_gb | |
| "6": | |
| - 3 | |
| "7": [] | |
| - "1": gpu/memory_allocated_gb | |
| "6": | |
| - 3 | |
| "7": [] | |
| python_version: 3.11.10 | |
| t: | |
| "1": | |
| - 1 | |
| - 11 | |
| - 41 | |
| - 49 | |
| - 71 | |
| "2": | |
| - 1 | |
| - 11 | |
| - 41 | |
| - 49 | |
| - 71 | |
| "3": | |
| - 7 | |
| - 13 | |
| - 16 | |
| - 55 | |
| - 61 | |
| "4": 3.11.10 | |
| "5": 0.20.1 | |
| "6": 4.52.4 | |
| "12": 0.20.1 | |
| "13": linux-x86_64 | |
| act_fn: | |
| value: relu | |
| batch_size: | |
| value: 8192 | |
| before_ln: | |
| value: false | |
| c_coeff: | |
| value: 4 | |
| cooldown_start_frac: | |
| value: 0.8 | |
| d_feature: | |
| value: 163840 | |
| d_model: | |
| value: 5120 | |
| device: | |
| value: cuda:5 | |
| initial_lr: | |
| value: 0.0002 | |
| layer_idx: | |
| value: 0 | |
| lr: | |
| value: 0.0002 | |
| min_lr_ratio: | |
| value: 0 | |
| model_name: | |
| value: Qwen/Qwen3-14B | |
| model_type: | |
| value: qwen | |
| n_batches: | |
| value: 76 | |
| n_grad_steps: | |
| value: 4 | |
| n_steps: | |
| value: 122070 | |
| preact_coeff: | |
| value: 6e-05 | |
| skip_connections: | |
| value: false | |
| sparsity_coeff_final: | |
| value: 12 | |
| x_scale: | |
| value: 1 | |
| y_scale: | |
| value: 1 | |