tim-lawson committed on
Commit
c9bfa01
·
verified ·
1 Parent(s): 1191545

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. train_config.json +76 -1
train_config.json CHANGED
@@ -1 +1,76 @@
1
- {"data": {"train_files": "data/fineweb_10B_gpt2/fineweb_train_*.bin", "train_tokens": null, "val_files": "data/fineweb_10B_gpt2/fineweb_val_*.bin", "val_tokens": null, "batch_size": 512, "device_batch_size": 32}, "model": {"dim": 768, "n_layers": 12, "n_heads": 12, "n_kv_heads": 12, "vocab_size": 50257, "multiple_of": 256, "ffn_dim_multiplier": 4, "norm_eps": 1e-05, "rope_theta": 10000, "use_scaled_rope": false, "max_seq_len": 1024, "initializer_range": 0.02, "zero_init_masks": false}, "optimizer": {"default": {"lr": 0.001, "beta1": 0.8, "beta2": 0.95, "eps": 1e-10, "weight_decay": 0}, "masks": {"lr": 0.001, "beta1": 0.8, "beta2": 0.95, "eps": 1e-10, "weight_decay": 0}, "norms": {"lr": 0.001, "beta1": 0.8, "beta2": 0.95, "eps": 1e-10, "weight_decay": 0}}, "scheduler": {"warmup_steps": 0.1, "start_factor": 0.1}, "gates": {"mean_targets": "auto", "mean_target_start": 1.0, "mean_target_end": 0.3, "var_target_delta": 0, "mean_coef_init": 0, "var_coef_init": 0, "coef_max": Infinity, "coef_min": -Infinity, "ema_steps_short": 1, "ema_steps_long": 10, "delta_min": 0.01, "coef_update_multiplier": 0.001}, "gates_zero_eps": 1e-08, "seed": 0, "project": "fineweb-gated", "run_id": null, "logdir": "logs/fineweb-gated", "log_gradients": false, "log_params": false, "log_every_steps": 10, "val_every_steps": -1, "save_every_steps": -1}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "train_files": "data/fineweb_10B_gpt2/fineweb_train_*.bin",
4
+ "train_tokens": null,
5
+ "val_files": "data/fineweb_10B_gpt2/fineweb_val_*.bin",
6
+ "val_tokens": null,
7
+ "batch_size": 512,
8
+ "device_batch_size": 32
9
+ },
10
+ "model": {
11
+ "dim": 768,
12
+ "n_layers": 12,
13
+ "n_heads": 12,
14
+ "n_kv_heads": 12,
15
+ "vocab_size": 50257,
16
+ "multiple_of": 256,
17
+ "ffn_dim_multiplier": 4,
18
+ "norm_eps": 1e-05,
19
+ "rope_theta": 10000,
20
+ "use_scaled_rope": false,
21
+ "max_seq_len": 1024,
22
+ "initializer_range": 0.02,
23
+ "zero_init_masks": false
24
+ },
25
+ "optimizer": {
26
+ "default": {
27
+ "lr": 0.001,
28
+ "beta1": 0.8,
29
+ "beta2": 0.95,
30
+ "eps": 1e-10,
31
+ "weight_decay": 0
32
+ },
33
+ "masks": {
34
+ "lr": 0.001,
35
+ "beta1": 0.8,
36
+ "beta2": 0.95,
37
+ "eps": 1e-10,
38
+ "weight_decay": 0
39
+ },
40
+ "norms": {
41
+ "lr": 0.001,
42
+ "beta1": 0.8,
43
+ "beta2": 0.95,
44
+ "eps": 1e-10,
45
+ "weight_decay": 0
46
+ }
47
+ },
48
+ "scheduler": {
49
+ "warmup_steps": 0.1,
50
+ "start_factor": 0.1
51
+ },
52
+ "gates": {
53
+ "mean_targets": "auto",
54
+ "mean_target_start": 1.0,
55
+ "mean_target_end": 0.3,
56
+ "var_target_delta": 0,
57
+ "mean_coef_init": 0,
58
+ "var_coef_init": 0,
59
+ "coef_max": "+inf",
60
+ "coef_min": "-inf",
61
+ "ema_steps_short": 1,
62
+ "ema_steps_long": 10,
63
+ "delta_min": 0.01,
64
+ "coef_update_multiplier": 0.001
65
+ },
66
+ "gates_zero_eps": 1e-08,
67
+ "seed": 0,
68
+ "project": "fineweb-gated",
69
+ "run_id": null,
70
+ "logdir": "logs/fineweb-gated",
71
+ "log_gradients": false,
72
+ "log_params": false,
73
+ "log_every_steps": 10,
74
+ "val_every_steps": -1,
75
+ "save_every_steps": -1
76
+ }