{
  "pipe-parallel-size": 1,
  "model-parallel-size": 1,

  "num-layers": 24,
  "hidden-size": 2048,
  "num-attention-heads": 16,
  "seq-length": 2048,
  "max-position-embeddings": 2048,
  "pos-emb": "rotary",
  "rotary-pct": 0.25,
  "no-weight-tying": true,
  "gpt-j-residual": true,
  "output-layer-parallelism": "column",

  "scaled-upper-triang-masked-softmax-fusion": true,
  "bias-gelu-fusion": true,

  "init_method": "small_init",
  "output_layer_init_method": "wang_init",

  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.0002,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8
    }
  },
  "min_lr": 0.00002,

  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": true,
    "allgather_bucket_size": 500000000,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 500000000,
    "contiguous_gradients": true,
    "cpu_offload": false
  },

  "train_micro_batch_size_per_gpu": 16,
  "gas": 1,
  "data-impl": "mmap",
  "num_workers": 1,

  "checkpoint-activations": true,
  "checkpoint-num-layers": 1,
  "partition-activations": true,
  "synchronize-each-layer": true,

  "gradient_clipping": 1.0,
  "weight-decay": 0.1,
  "hidden-dropout": 0,
  "attention-dropout": 0,

  "fp16": {
    "fp16": true,
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 12,
    "hysteresis": 2,
    "min_loss_scale": 1
  },

  "train-iters": 71500,
  "lr-decay-iters": 71250,
  "distributed-backend": "nccl",
  "lr-decay-style": "cosine",
  "warmup": 0.01,
  "save-interval": 250,
  "eval-interval": 71500,
  "eval-iters": 10,

  "log-interval": 10,
  "steps_per_print": 10,
  "wall_clock_breakdown": true,

  "save": "/fsx/hailey/pythia/ckpts/1.3B",
  "load": "/fsx/hailey/pythia/ckpts/1.3B",

  "train-data-paths": ["/fsx/pile/pile_20B_tokenizer_text_document"],
  "valid-data-paths": ["/fsx/pile/pile_20B_tokenizer_text_document"],
  "test-data-paths": ["/fsx/pile/pile_20B_tokenizer_text_document"],

  "tokenizer-type": "HFTokenizer",
  "vocab-file": "/fsx/pile/20B_tokenizer.json",

  "tensorboard-dir": "/fsx/code-fim/FIMlogs/1.3B-AR-Pile-9-6-22-rotary-1MtokBS",
  "log-dir": "/fsx/code-fim/FIMlogs/1.3B-AR-Pile-9-6-22-rotary-1MtokBS",

  "use_wandb": true,
  "wandb_group": "Pythia 1.3B",
  "wandb_team": "eleutherai",
  "wandb_project": "pythia",

  "launcher": "openmpi",
  "deepspeed_mpi": true
}