diff --git "a/training.log" "b/training.log" new file mode 100644--- /dev/null +++ "b/training.log" @@ -0,0 +1,2254 @@ +[2025-07-10 15:48:28,066] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-07-10 15:48:29,737] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +[2025-07-10 15:48:30,494] [INFO] [runner.py:499:main] Using IP address of 192.168.100.12 for node worker-1 +[2025-07-10 15:48:30,496] [INFO] [multinode_runner.py:85:get_cmd] Running on the following workers: worker-1,worker-2 +[2025-07-10 15:48:30,496] [INFO] [runner.py:610:main] cmd = pdsh -S -f 1024 -w worker-1,worker-2 export NCCL_DEBUG_SUBSYS=ALL; export NCCL_DEBUG=WARN; export NCCL_TIMEOUT=6000; export PYTHONPATH=/root/workspace/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning; cd /root/workspace/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning; /root/workspace/DeepSpeedExamples/.venv/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJ3b3JrZXItMSI6IFswLCAxLCAyLCAzLCA0LCA1LCA2LCA3XSwgIndvcmtlci0yIjogWzAsIDEsIDIsIDMsIDQsIDUsIDYsIDddfQ== --node_rank=%n --master_addr=192.168.100.12 --master_port=29500 --enable_each_rank_log=None main.py --data_path mncai/foundation_model_smoltalk_ko_translate mncai/foundation_model_smoltalk_zh_translate HuggingFaceTB/smoltalk --data_name default default all --data_split 1,0,0 --model_name_or_path Qwen/Qwen3-0.6B --per_device_train_batch_size 1 --per_device_eval_batch_size 1 --max_seq_len 16384 --learning_rate 9.65e-6 --weight_decay 0. --num_train_epochs 1 --gradient_accumulation_steps 16 --lr_scheduler_type cosine --num_warmup_steps 500 --seed 1234 --gradient_checkpointing --zero_stage 1 --deepspeed --output_dir ./output_step1_qwen3_0.6b --dtype bf16 +worker-1: [2025-07-10 15:48:32,980] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-1: [2025-07-10 15:48:34,574] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-1: [2025-07-10 15:48:34,978] [INFO] [launch.py:139:main] 0 NCCL_DEBUG_SUBSYS=ALL +worker-1: [2025-07-10 15:48:34,978] [INFO] [launch.py:139:main] 0 NCCL_DEBUG=WARN +worker-1: [2025-07-10 15:48:34,978] [INFO] [launch.py:139:main] 0 NCCL_TIMEOUT=6000 +worker-1: [2025-07-10 15:48:34,978] [INFO] [launch.py:146:main] WORLD INFO DICT: {'worker-1': [0, 1, 2, 3, 4, 5, 6, 7], 'worker-2': [0, 1, 2, 3, 4, 5, 6, 7]} +worker-1: [2025-07-10 15:48:34,978] [INFO] [launch.py:152:main] nnodes=2, num_local_procs=8, node_rank=0 +worker-1: [2025-07-10 15:48:34,978] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(, {'worker-1': [0, 1, 2, 3, 4, 5, 6, 7], 'worker-2': [8, 9, 10, 11, 12, 13, 14, 15]}) +worker-1: [2025-07-10 15:48:34,978] [INFO] [launch.py:164:main] dist_world_size=16 +worker-1: [2025-07-10 15:48:34,978] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +worker-1: [2025-07-10 15:48:34,979] [INFO] [launch.py:256:main] process 114644 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=0', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-1: [2025-07-10 15:48:34,979] [INFO] [launch.py:256:main] process 114645 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=1', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-1: [2025-07-10 15:48:34,980] [INFO] [launch.py:256:main] process 114646 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=2', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-1: [2025-07-10 15:48:34,980] [INFO] [launch.py:256:main] process 114647 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=3', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-1: [2025-07-10 15:48:34,981] [INFO] [launch.py:256:main] process 114648 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=4', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-1: [2025-07-10 15:48:34,981] [INFO] [launch.py:256:main] process 114649 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=5', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-1: [2025-07-10 15:48:34,981] [INFO] [launch.py:256:main] process 114650 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=6', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-1: [2025-07-10 15:48:34,982] [INFO] [launch.py:256:main] process 114651 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=7', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-2: [2025-07-10 15:48:36,634] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-2: [2025-07-10 15:48:38,304] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-2: [2025-07-10 15:48:38,708] [INFO] [launch.py:139:main] 1 NCCL_DEBUG_SUBSYS=ALL +worker-2: [2025-07-10 15:48:38,708] [INFO] [launch.py:139:main] 1 NCCL_DEBUG=WARN +worker-2: [2025-07-10 15:48:38,708] [INFO] [launch.py:139:main] 1 NCCL_TIMEOUT=6000 +worker-2: [2025-07-10 15:48:38,708] [INFO] [launch.py:146:main] WORLD INFO DICT: {'worker-1': [0, 1, 2, 3, 4, 5, 6, 7], 'worker-2': [0, 1, 2, 3, 4, 5, 6, 7]} +worker-2: [2025-07-10 15:48:38,708] [INFO] [launch.py:152:main] nnodes=2, num_local_procs=8, node_rank=1 +worker-2: [2025-07-10 15:48:38,708] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(, {'worker-1': [0, 1, 2, 3, 4, 5, 6, 7], 'worker-2': [8, 9, 10, 11, 12, 13, 14, 15]}) +worker-2: [2025-07-10 15:48:38,708] [INFO] [launch.py:164:main] dist_world_size=16 +worker-2: [2025-07-10 15:48:38,708] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +worker-2: [2025-07-10 15:48:38,708] [INFO] [launch.py:256:main] process 19266 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=0', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-2: [2025-07-10 15:48:38,709] [INFO] [launch.py:256:main] process 19267 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=1', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-2: [2025-07-10 15:48:38,709] [INFO] [launch.py:256:main] process 19268 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=2', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-2: [2025-07-10 15:48:38,710] [INFO] [launch.py:256:main] process 19269 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=3', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-2: [2025-07-10 15:48:38,710] [INFO] [launch.py:256:main] process 19270 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=4', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-2: [2025-07-10 15:48:38,711] [INFO] [launch.py:256:main] process 19271 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=5', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-2: [2025-07-10 15:48:38,711] [INFO] [launch.py:256:main] process 19272 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=6', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-2: [2025-07-10 15:48:38,712] [INFO] [launch.py:256:main] process 19273 spawned with command: ['/root/workspace/DeepSpeedExamples/.venv/bin/python3', '-u', 'main.py', '--local_rank=7', '--data_path', 'mncai/foundation_model_smoltalk_ko_translate', 'mncai/foundation_model_smoltalk_zh_translate', 'HuggingFaceTB/smoltalk', '--data_name', 'default', 'default', 'all', '--data_split', '1,0,0', '--model_name_or_path', 'Qwen/Qwen3-0.6B', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '16384', '--learning_rate', '9.65e-6', '--weight_decay', '0.', '--num_train_epochs', '1', '--gradient_accumulation_steps', '16', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '500', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '1', '--deepspeed', '--output_dir', './output_step1_qwen3_0.6b', '--dtype', 'bf16'] +worker-1: [2025-07-10 15:48:42,581] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-1: [2025-07-10 15:48:43,397] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-1: [2025-07-10 15:48:43,441] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-1: [2025-07-10 15:48:43,441] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-1: [2025-07-10 15:48:43,498] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-1: [2025-07-10 15:48:43,498] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-1: [2025-07-10 15:48:43,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-1: [2025-07-10 15:48:43,526] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-1: [2025-07-10 15:48:44,440] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-1: [2025-07-10 15:48:44,852] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-1: [2025-07-10 15:48:44,906] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-1: [2025-07-10 15:48:44,948] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-1: [2025-07-10 15:48:44,955] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-1: [2025-07-10 15:48:45,019] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-1: [2025-07-10 15:48:45,096] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-1: [2025-07-10 15:48:45,386] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-2: [2025-07-10 15:48:45,696] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-2: [2025-07-10 15:48:46,165] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-1: [2025-07-10 15:48:46,619] [INFO] [comm.py:675:init_distributed] cdb=None +worker-1: [2025-07-10 15:48:46,619] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +worker-2: [2025-07-10 15:48:46,831] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-2: [2025-07-10 15:48:46,911] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-2: [2025-07-10 15:48:46,962] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-2: [2025-07-10 15:48:46,963] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-2: [2025-07-10 15:48:47,035] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-2: [2025-07-10 15:48:47,045] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-2: [2025-07-10 15:48:47,055] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) +worker-2: [2025-07-10 15:48:48,146] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-2: [2025-07-10 15:48:48,246] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-2: [2025-07-10 15:48:48,347] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-2: [2025-07-10 15:48:48,370] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-2: [2025-07-10 15:48:48,382] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-2: [2025-07-10 15:48:48,443] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-1: [2025-07-10 15:48:48,492] [INFO] [comm.py:675:init_distributed] cdb=None +worker-1: [2025-07-10 15:48:48,495] [INFO] [comm.py:675:init_distributed] cdb=None +worker-1: [2025-07-10 15:48:48,503] [INFO] [comm.py:675:init_distributed] cdb=None +worker-1: [2025-07-10 15:48:48,546] [INFO] [comm.py:675:init_distributed] cdb=None +worker-1: [2025-07-10 15:48:48,584] [INFO] [comm.py:675:init_distributed] cdb=None +worker-1: [2025-07-10 15:48:48,586] [INFO] [comm.py:675:init_distributed] cdb=None +worker-2: [2025-07-10 15:48:48,805] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False +worker-1: [2025-07-10 15:48:48,935] [INFO] [comm.py:675:init_distributed] cdb=None +worker-2: [2025-07-10 15:48:49,433] [INFO] [comm.py:675:init_distributed] cdb=None +worker-2: [2025-07-10 15:48:50,498] [INFO] [comm.py:675:init_distributed] cdb=None +worker-2: [2025-07-10 15:48:51,846] [INFO] [comm.py:675:init_distributed] cdb=None +worker-2: [2025-07-10 15:48:51,847] [INFO] [comm.py:675:init_distributed] cdb=None +worker-2: [2025-07-10 15:48:51,852] [INFO] [comm.py:675:init_distributed] cdb=None +worker-2: [2025-07-10 15:48:51,877] [INFO] [comm.py:675:init_distributed] cdb=None +worker-2: [2025-07-10 15:48:51,881] [INFO] [comm.py:675:init_distributed] cdb=None +worker-2: [2025-07-10 15:48:51,897] [INFO] [comm.py:675:init_distributed] cdb=None +worker-1: NCCL version 2.21.5+cuda12.4 +worker-1: ninja: no work to do. +worker-1: Time to load fused_adam op: 0.03760719299316406 seconds +worker-1: [2025-07-10 15:50:58,707] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-1: ninja: no work to do. +worker-1: Time to load fused_adam op: 0.04649543762207031 seconds +worker-1: [2025-07-10 15:50:59,709] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-1: ninja: no work to do. +worker-1: Time to load fused_adam op: 0.054692983627319336 seconds +worker-1: [2025-07-10 15:51:02,809] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-1: ninja: no work to do. +worker-1: Time to load fused_adam op: 0.041512250900268555 seconds +worker-1: [2025-07-10 15:51:02,864] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-1: Length: 30000 +worker-1: Sample data: {'input_ids': tensor([151644, 872, 198, ..., 151645, 151645, 151645]), 'attention_mask': tensor([1, 1, 1, ..., 0, 0, 0]), 'labels': tensor([151644, 872, 198, ..., -100, -100, -100])} +worker-1: ninja: no work to do. +worker-1: Time to load fused_adam op: 0.05083060264587402 seconds +worker-1: [2025-07-10 15:51:03,099] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed info: version=0.17.1, git-hash=unknown, git-branch=unknown +worker-1: [2025-07-10 15:51:03,099] [INFO] [comm.py:700:init_distributed] Distributed backend already initialized +worker-1: [2025-07-10 15:51:03,100] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-2: ninja: no work to do. +worker-2: Time to load fused_adam op: 0.050290584564208984 seconds +worker-2: [2025-07-10 15:51:03,698] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-1: ninja: no work to do. +worker-1: Time to load fused_adam op: 0.048990726470947266 seconds +worker-1: [2025-07-10 15:51:04,070] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-1: Time to load fused_adam op: 0.10116410255432129 seconds +worker-1: [2025-07-10 15:51:04,148] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-2: ninja: no work to do. +worker-2: Time to load fused_adam op: 0.03737616539001465 seconds +worker-2: [2025-07-10 15:51:04,462] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-1: ninja: no work to do. +worker-1: Time to load fused_adam op: 0.029782772064208984 seconds +worker-1: [2025-07-10 15:51:04,559] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-2: ninja: no work to do. +worker-2: Time to load fused_adam op: 0.04036116600036621 seconds +worker-2: [2025-07-10 15:51:07,176] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-2: ninja: no work to do. +worker-2: Time to load fused_adam op: 0.0366818904876709 seconds +worker-2: [2025-07-10 15:51:07,244] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-2: ninja: no work to do. +worker-2: Time to load fused_adam op: 0.04654741287231445 seconds +worker-2: [2025-07-10 15:51:08,011] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-1: [2025-07-10 15:51:09,308] [INFO] [engine.py:1325:_configure_distributed_model] ********** distributed groups summary ********** +worker-1: self.dp_world_size=16 +worker-1: self.mp_world_size=1 +worker-1: self.seq_dp_world_size=16 +worker-1: self.sequence_parallel_size=1 +worker-1: *********************************************** +worker-2: ninja: no work to do. +worker-2: Time to load fused_adam op: 0.05138659477233887 seconds +worker-2: [2025-07-10 15:51:09,389] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-2: ninja: no work to do. +worker-2: Time to load fused_adam op: 0.035462379455566406 seconds +worker-2: [2025-07-10 15:51:15,707] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-2: ninja: no work to do. +worker-2: Time to load fused_adam op: 0.03232145309448242 seconds +worker-2: [2025-07-10 15:51:20,621] [INFO] [config.py:655:__init__] Config mesh_device None world_size = 16 +worker-1: [2025-07-10 15:51:31,079] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +worker-1: [2025-07-10 15:51:31,080] [INFO] [logging.py:107:log_dist] [Rank 0] Using client Optimizer as basic optimizer +worker-1: [2025-07-10 15:51:31,081] [INFO] [logging.py:107:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +worker-1: [2025-07-10 15:51:31,085] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +worker-1: [2025-07-10 15:51:31,086] [INFO] [utils.py:59:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +worker-1: [2025-07-10 15:51:31,086] [INFO] [logging.py:107:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 1 optimizer +worker-1: [2025-07-10 15:51:31,086] [INFO] [stage_1_and_2.py:151:__init__] Reduce bucket size 500000000 +worker-1: [2025-07-10 15:51:31,086] [INFO] [stage_1_and_2.py:152:__init__] Allgather bucket size 500000000 +worker-1: [2025-07-10 15:51:31,086] [INFO] [stage_1_and_2.py:153:__init__] CPU Offload: False +worker-1: [2025-07-10 15:51:31,086] [INFO] [stage_1_and_2.py:154:__init__] Round robin gradient partitioning: False +worker-1: [2025-07-10 15:51:41,241] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states +worker-1: [2025-07-10 15:51:41,241] [INFO] [utils.py:782:see_memory_usage] MA 1.25 GB Max_MA 1.25 GB CA 1.25 GB Max_CA 1 GB +worker-1: [2025-07-10 15:51:41,242] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 745.39 GB, percent = 74.0% +worker-1: [2025-07-10 15:51:41,476] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states +worker-1: [2025-07-10 15:51:41,476] [INFO] [utils.py:782:see_memory_usage] MA 1.25 GB Max_MA 1.39 GB CA 1.39 GB Max_CA 1 GB +worker-1: [2025-07-10 15:51:41,477] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 745.41 GB, percent = 74.0% +worker-1: [2025-07-10 15:51:41,477] [INFO] [stage_1_and_2.py:573:__init__] optimizer state initialized +worker-1: [2025-07-10 15:51:41,690] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer +worker-1: [2025-07-10 15:51:41,691] [INFO] [utils.py:782:see_memory_usage] MA 1.25 GB Max_MA 1.25 GB CA 1.39 GB Max_CA 1 GB +worker-1: [2025-07-10 15:51:41,691] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 745.46 GB, percent = 74.0% +worker-1: [2025-07-10 15:51:41,692] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer +worker-1: [2025-07-10 15:51:41,693] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed using client LR scheduler +worker-1: [2025-07-10 15:51:41,693] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed LR Scheduler = +worker-1: [2025-07-10 15:51:41,693] [INFO] [logging.py:107:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)] +worker-1: [2025-07-10 15:51:41,695] [INFO] [logging.py:107:log_dist] [Rank 0] [TorchCheckpointEngine] Initialized with serialization = True +worker-1: [2025-07-10 15:51:41,695] [INFO] [config.py:921:print] DeepSpeedEngine configuration: +worker-1: [2025-07-10 15:51:41,695] [INFO] [config.py:925:print] activation_checkpointing_config { +worker-1: "partition_activations": false, +worker-1: "contiguous_memory_optimization": false, +worker-1: "cpu_checkpointing": false, +worker-1: "number_checkpoints": null, +worker-1: "synchronize_checkpoint_boundary": false, +worker-1: "profile": false +worker-1: } +worker-1: [2025-07-10 15:51:41,695] [INFO] [config.py:925:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'intra_op_parallelism': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +worker-1: [2025-07-10 15:51:41,696] [INFO] [config.py:925:print] amp_enabled .................. False +worker-1: [2025-07-10 15:51:41,696] [INFO] [config.py:925:print] amp_params ................... False +worker-1: [2025-07-10 15:51:41,696] [INFO] [config.py:925:print] autotuning_config ............ { +worker-1: "enabled": false, +worker-1: "start_step": null, +worker-1: "end_step": null, +worker-1: "metric_path": null, +worker-1: "arg_mappings": null, +worker-1: "metric": "throughput", +worker-1: "model_info": null, +worker-1: "results_dir": "autotuning_results", +worker-1: "exps_dir": "autotuning_exps", +worker-1: "overwrite": true, +worker-1: "fast": true, +worker-1: "start_profile_step": 3, +worker-1: "end_profile_step": 5, +worker-1: "tuner_type": "gridsearch", +worker-1: "tuner_early_stopping": 5, +worker-1: "tuner_num_trials": 50, +worker-1: "model_info_path": null, +worker-1: "mp_size": 1, +worker-1: "max_train_batch_size": null, +worker-1: "min_train_batch_size": 1, +worker-1: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +worker-1: "min_train_micro_batch_size_per_gpu": 1, +worker-1: "num_tuning_micro_batch_sizes": 3 +worker-1: } +worker-1: [2025-07-10 15:51:41,696] [INFO] [config.py:925:print] bfloat16_config .............. enabled=True immediate_grad_update=False check_grad_overflow=False +worker-1: [2025-07-10 15:51:41,696] [INFO] [config.py:925:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': True, 'writer': None} +worker-1: [2025-07-10 15:51:41,696] [INFO] [config.py:925:print] checkpoint_parallel_write_pipeline False +worker-1: [2025-07-10 15:51:41,696] [INFO] [config.py:925:print] checkpoint_tag_validation_enabled True +worker-1: [2025-07-10 15:51:41,696] [INFO] [config.py:925:print] checkpoint_tag_validation_fail False +worker-1: [2025-07-10 15:51:41,696] [INFO] [config.py:925:print] comms_config ................. +worker-1: [2025-07-10 15:51:41,697] [INFO] [config.py:925:print] communication_data_type ...... None +worker-1: [2025-07-10 15:51:41,697] [INFO] [config.py:925:print] compile_config ............... deepcompile=False free_activation=False offload_activation=False offload_opt_states=False double_buffer=True symmetric_memory=False debug_log=False offload_parameters=False sync_before_reduce=False sync_after_reduce=False sync_before_allgather=False sync_after_allgather=False +worker-1: [2025-07-10 15:51:41,697] [INFO] [config.py:925:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +worker-1: [2025-07-10 15:51:41,697] [INFO] [config.py:925:print] curriculum_enabled_legacy .... False +worker-1: [2025-07-10 15:51:41,697] [INFO] [config.py:925:print] curriculum_params_legacy ..... False +worker-1: [2025-07-10 15:51:41,697] [INFO] [config.py:925:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'pin_memory': False, 'curriculum_learning': {'enabled': False}, 'dynamic_batching': {'enabled': False, 'lr_scaling_method': 'linear', 'min_batch_size': 1, 'max_batch_size': None, 'sequence_picking_order': 'dataloader', 'verbose': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +worker-1: [2025-07-10 15:51:41,697] [INFO] [config.py:925:print] data_efficiency_enabled ...... False +worker-1: [2025-07-10 15:51:41,697] [INFO] [config.py:925:print] dataloader_drop_last ......... False +worker-1: [2025-07-10 15:51:41,697] [INFO] [config.py:925:print] disable_allgather ............ False +worker-1: [2025-07-10 15:51:41,697] [INFO] [config.py:925:print] dump_state ................... False +worker-1: [2025-07-10 15:51:41,697] [INFO] [config.py:925:print] eigenvalue_enabled ........... False +worker-1: [2025-07-10 15:51:41,697] [INFO] [config.py:925:print] eigenvalue_gas_boundary_resolution 1 +worker-1: [2025-07-10 15:51:41,698] [INFO] [config.py:925:print] eigenvalue_layer_name ........ bert.encoder.layer +worker-1: [2025-07-10 15:51:41,698] [INFO] [config.py:925:print] eigenvalue_layer_num ......... 0 +worker-1: [2025-07-10 15:51:41,698] [INFO] [config.py:925:print] eigenvalue_max_iter .......... 100 +worker-1: [2025-07-10 15:51:41,698] [INFO] [config.py:925:print] eigenvalue_stability ......... 1e-06 +worker-1: [2025-07-10 15:51:41,698] [INFO] [config.py:925:print] eigenvalue_tol ............... 0.01 +worker-1: [2025-07-10 15:51:41,698] [INFO] [config.py:925:print] eigenvalue_verbose ........... False +worker-1: [2025-07-10 15:51:41,698] [INFO] [config.py:925:print] elasticity_enabled ........... False +worker-1: [2025-07-10 15:51:41,698] [INFO] [config.py:925:print] float16_config ............... enabled=False auto_cast=False loss_scale=0.0 initial_scale_power=16 loss_scale_window=1000 hysteresis=2 consecutive_hysteresis=False min_loss_scale=1 fp16_master_weights_and_grads=False +worker-1: [2025-07-10 15:51:41,698] [INFO] [config.py:925:print] flops_profiler_config ........ { +worker-1: "enabled": false, +worker-1: "recompute_fwd_factor": 0.0, +worker-1: "profile_step": 1, +worker-1: "module_depth": -1, +worker-1: "top_modules": 1, +worker-1: "detailed": true, +worker-1: "output_file": null +worker-1: } +worker-1: [2025-07-10 15:51:41,698] [INFO] [config.py:925:print] global_rank .................. 0 +worker-1: [2025-07-10 15:51:41,698] [INFO] [config.py:925:print] grad_accum_dtype ............. None +worker-1: [2025-07-10 15:51:41,698] [INFO] [config.py:925:print] gradient_accumulation_steps .. 16 +worker-1: [2025-07-10 15:51:41,699] [INFO] [config.py:925:print] gradient_clipping ............ 1.0 +worker-1: [2025-07-10 15:51:41,699] [INFO] [config.py:925:print] gradient_predivide_factor .... 1.0 +worker-1: [2025-07-10 15:51:41,699] [INFO] [config.py:925:print] graph_harvesting ............. False +worker-1: [2025-07-10 15:51:41,699] [INFO] [config.py:925:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +worker-1: [2025-07-10 15:51:41,699] [INFO] [config.py:925:print] load_universal_checkpoint .... False +worker-1: [2025-07-10 15:51:41,699] [INFO] [config.py:925:print] memory_breakdown ............. False +worker-1: [2025-07-10 15:51:41,699] [INFO] [config.py:925:print] mics_hierarchial_params_gather False +worker-1: [2025-07-10 15:51:41,699] [INFO] [config.py:925:print] mics_shard_size .............. -1 +worker-1: [2025-07-10 15:51:41,699] [INFO] [config.py:925:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='step1_tensorboard/ds_tensorboard_logs/', job_name='step1_model_tensorboard') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=True, group=None, team=None, project='foundationModel') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +worker-1: [2025-07-10 15:51:41,699] [INFO] [config.py:925:print] nebula_config ................ { +worker-1: "enabled": false, +worker-1: "persistent_storage_path": null, +worker-1: "persistent_time_interval": 100, +worker-1: "num_of_version_in_retention": 2, +worker-1: "enable_nebula_load": true, +worker-1: "load_path": null +worker-1: } +worker-1: [2025-07-10 15:51:41,699] [INFO] [config.py:925:print] optimizer_legacy_fusion ...... False +worker-1: [2025-07-10 15:51:41,699] [INFO] [config.py:925:print] optimizer_name ............... None +worker-1: [2025-07-10 15:51:41,700] [INFO] [config.py:925:print] optimizer_params ............. None +worker-1: [2025-07-10 15:51:41,700] [INFO] [config.py:925:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +worker-1: [2025-07-10 15:51:41,700] [INFO] [config.py:925:print] pld_enabled .................. False +worker-1: [2025-07-10 15:51:41,700] [INFO] [config.py:925:print] pld_params ................... False +worker-1: [2025-07-10 15:51:41,700] [INFO] [config.py:925:print] prescale_gradients ........... False +worker-1: [2025-07-10 15:51:41,700] [INFO] [config.py:925:print] scheduler_name ............... None +worker-1: [2025-07-10 15:51:41,700] [INFO] [config.py:925:print] scheduler_params ............. None +worker-1: [2025-07-10 15:51:41,700] [INFO] [config.py:925:print] seq_parallel_communication_data_type torch.float32 +worker-1: [2025-07-10 15:51:41,700] [INFO] [config.py:925:print] sparse_attention ............. None +worker-1: [2025-07-10 15:51:41,700] [INFO] [config.py:925:print] sparse_gradients_enabled ..... False +worker-1: [2025-07-10 15:51:41,700] [INFO] [config.py:925:print] steps_per_print .............. 10 +worker-1: [2025-07-10 15:51:41,700] [INFO] [config.py:925:print] tensor_parallel_config ....... dtype=torch.float16 autotp_size=0 tp_overlap_comm=False tensor_parallel=TPConfig(tp_size=1, tp_grain_size=1, mpu=None, tp_group=None) injection_policy_tuple=None keep_module_on_host=False replace_with_kernel_inject=False +worker-1: [2025-07-10 15:51:41,701] [INFO] [config.py:925:print] timers_config ................ enabled=True synchronized=True +worker-1: [2025-07-10 15:51:41,701] [INFO] [config.py:925:print] train_batch_size ............. 256 +worker-1: [2025-07-10 15:51:41,701] [INFO] [config.py:925:print] train_micro_batch_size_per_gpu 1 +worker-1: [2025-07-10 15:51:41,701] [INFO] [config.py:925:print] use_data_before_expert_parallel_ False +worker-1: [2025-07-10 15:51:41,701] [INFO] [config.py:925:print] use_node_local_storage ....... False +worker-1: [2025-07-10 15:51:41,701] [INFO] [config.py:925:print] wall_clock_breakdown ......... False +worker-1: [2025-07-10 15:51:41,701] [INFO] [config.py:925:print] weight_quantization_config ... None +worker-1: [2025-07-10 15:51:41,701] [INFO] [config.py:925:print] world_size ................... 16 +worker-1: [2025-07-10 15:51:41,701] [INFO] [config.py:925:print] zero_allow_untested_optimizer False +worker-1: [2025-07-10 15:51:41,701] [INFO] [config.py:925:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100000000, max_in_cpu=1000000000, pin_memory=False) offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=30000000 param_persistence_threshold=10000 model_persistence_threshold=9223372036854775807 max_live_parameters=30000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False module_granularity_threshold=0 use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False zeropp_loco_param=None mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=False pipeline_loading_checkpoint=False override_module_apply=True log_trace_cache_warnings=False +worker-1: [2025-07-10 15:51:41,701] [INFO] [config.py:925:print] zero_enabled ................. True +worker-1: [2025-07-10 15:51:41,701] [INFO] [config.py:925:print] zero_force_ds_cpu_optimizer .. True +worker-1: [2025-07-10 15:51:41,701] [INFO] [config.py:925:print] zero_optimization_stage ...... 1 +worker-1: [2025-07-10 15:51:41,702] [INFO] [config.py:911:print_user_config] json = { +worker-1: "train_batch_size": 256, +worker-1: "train_micro_batch_size_per_gpu": 1, +worker-1: "steps_per_print": 10, +worker-1: "zero_optimization": { +worker-1: "stage": 1, +worker-1: "overlap_comm": true, +worker-1: "offload_param": { +worker-1: "device": "none" +worker-1: }, +worker-1: "stage3_param_persistence_threshold": 1.000000e+04, +worker-1: "stage3_max_live_parameters": 3.000000e+07, +worker-1: "stage3_prefetch_bucket_size": 3.000000e+07, +worker-1: "memory_efficient_linear": false +worker-1: }, +worker-1: "bfloat16": { +worker-1: "enabled": true +worker-1: }, +worker-1: "gradient_clipping": 1.0, +worker-1: "prescale_gradients": false, +worker-1: "wall_clock_breakdown": false, +worker-1: "hybrid_engine": { +worker-1: "enabled": false, +worker-1: "max_out_tokens": 512, +worker-1: "inference_tp_size": 1, +worker-1: "release_inference_cache": false, +worker-1: "pin_parameters": true, +worker-1: "tp_gather_partition_size": 8 +worker-1: }, +worker-1: "tensorboard": { +worker-1: "enabled": false, +worker-1: "output_path": "step1_tensorboard/ds_tensorboard_logs/", +worker-1: "job_name": "step1_model_tensorboard" +worker-1: }, +worker-1: "wandb": { +worker-1: "enabled": true, +worker-1: "project": "foundationModel" +worker-1: } +worker-1: } +worker-1: ***** Running training ***** +worker-1: ***** Evaluating perplexity, Epoch 0/1 ***** +worker-1: ppl: 7.56785249710083, loss: 2.023909330368042 +worker-1: Beginning of Epoch 1/1, Total Micro Batches 1875 +worker-1: Model Parameters: 0.596 B, Latency: 2.79s, TFLOPs: 4.14, Samples/sec: 0.36, Time/seq 2.79s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.36s, TFLOPs: 4.89, Samples/sec: 0.42, Time/seq 2.36s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.29, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.68s, TFLOPs: 4.30, Samples/sec: 0.37, Time/seq 2.68s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.29, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.08s, TFLOPs: 2.83, Samples/sec: 0.25, Time/seq 4.08s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 3.00s, TFLOPs: 3.84, Samples/sec: 0.33, Time/seq 3.00s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.72s, TFLOPs: 4.25, Samples/sec: 0.37, Time/seq 2.72s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 3.93s, TFLOPs: 2.94, Samples/sec: 0.25, Time/seq 3.93s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 3.36s, TFLOPs: 3.44, Samples/sec: 0.30, Time/seq 3.36s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.17s, TFLOPs: 2.77, Samples/sec: 0.24, Time/seq 4.17s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.17s, TFLOPs: 2.77, Samples/sec: 0.24, Time/seq 4.17s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.13s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.13s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.10s, TFLOPs: 2.81, Samples/sec: 0.24, Time/seq 4.10s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.08s, TFLOPs: 2.82, Samples/sec: 0.24, Time/seq 4.08s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.17s, TFLOPs: 2.77, Samples/sec: 0.24, Time/seq 4.17s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.15s, TFLOPs: 2.78, Samples/sec: 0.24, Time/seq 4.15s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: [2025-07-10 15:59:13,868] [INFO] [logging.py:107:log_dist] [Rank 0] step=10, skipped=0, lr=[1.9300000000000002e-07, 1.9300000000000002e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +worker-1: [2025-07-10 15:59:14,476] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=10, RunningAvgSamplesPerSec=5.738281497247122, CurrSamplesPerSec=5.738691354970954, MemAllocated=6.48GB, MaxMemAllocated=38.22GB +worker-1: Model Parameters: 0.596 B, Latency: 4.13s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.13s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.14s, TFLOPs: 2.78, Samples/sec: 0.24, Time/seq 4.14s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.13s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.13s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.14s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.14s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.17s, TFLOPs: 2.77, Samples/sec: 0.24, Time/seq 4.17s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.12s, TFLOPs: 2.80, Samples/sec: 0.24, Time/seq 4.12s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.17s, TFLOPs: 2.77, Samples/sec: 0.24, Time/seq 4.17s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.13s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.13s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.15s, TFLOPs: 2.78, Samples/sec: 0.24, Time/seq 4.15s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.13s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.13s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: [2025-07-10 16:06:40,257] [INFO] [logging.py:107:log_dist] [Rank 0] step=20, skipped=0, lr=[3.8600000000000004e-07, 3.8600000000000004e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +worker-1: [2025-07-10 16:06:40,871] [INFO] [timer.py:264:stop] epoch=0/micro_step=320/global_step=20, RunningAvgSamplesPerSec=5.7385433114267235, CurrSamplesPerSec=5.749414131619069, MemAllocated=6.48GB, MaxMemAllocated=38.22GB +worker-1: Model Parameters: 0.596 B, Latency: 4.05s, TFLOPs: 2.85, Samples/sec: 0.25, Time/seq 4.05s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.18s, TFLOPs: 2.76, Samples/sec: 0.24, Time/seq 4.18s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.06s, TFLOPs: 2.84, Samples/sec: 0.25, Time/seq 4.06s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.08s, TFLOPs: 2.83, Samples/sec: 0.25, Time/seq 4.08s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.05s, TFLOPs: 2.85, Samples/sec: 0.25, Time/seq 4.05s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.15s, TFLOPs: 2.78, Samples/sec: 0.24, Time/seq 4.15s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.22s, TFLOPs: 2.74, Samples/sec: 0.24, Time/seq 4.22s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.06s, TFLOPs: 2.84, Samples/sec: 0.25, Time/seq 4.06s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.07s, TFLOPs: 2.83, Samples/sec: 0.25, Time/seq 4.07s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.30s, TFLOPs: 2.68, Samples/sec: 0.23, Time/seq 4.30s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: [2025-07-10 16:14:06,596] [INFO] [logging.py:107:log_dist] [Rank 0] step=30, skipped=0, lr=[5.79e-07, 5.79e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +worker-1: [2025-07-10 16:14:07,224] [INFO] [timer.py:264:stop] epoch=0/micro_step=480/global_step=30, RunningAvgSamplesPerSec=5.738738363303498, CurrSamplesPerSec=5.734018440689576, MemAllocated=6.48GB, MaxMemAllocated=38.22GB +worker-1: Model Parameters: 0.596 B, Latency: 4.18s, TFLOPs: 2.76, Samples/sec: 0.24, Time/seq 4.18s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.08s, TFLOPs: 2.83, Samples/sec: 0.25, Time/seq 4.08s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.12s, TFLOPs: 2.80, Samples/sec: 0.24, Time/seq 4.12s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.10s, TFLOPs: 2.81, Samples/sec: 0.24, Time/seq 4.10s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.17s, TFLOPs: 2.77, Samples/sec: 0.24, Time/seq 4.17s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.17s, TFLOPs: 2.77, Samples/sec: 0.24, Time/seq 4.17s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.19s, TFLOPs: 2.75, Samples/sec: 0.24, Time/seq 4.19s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.22s, TFLOPs: 2.73, Samples/sec: 0.24, Time/seq 4.22s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.21s, TFLOPs: 2.74, Samples/sec: 0.24, Time/seq 4.21s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.13s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.13s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: [2025-07-10 16:21:33,203] [INFO] [logging.py:107:log_dist] [Rank 0] step=40, skipped=0, lr=[7.720000000000001e-07, 7.720000000000001e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +worker-1: [2025-07-10 16:21:33,804] [INFO] [timer.py:264:stop] epoch=0/micro_step=640/global_step=40, RunningAvgSamplesPerSec=5.7379922579880605, CurrSamplesPerSec=5.7367844457490715, MemAllocated=6.48GB, MaxMemAllocated=38.22GB +worker-1: Model Parameters: 0.596 B, Latency: 4.15s, TFLOPs: 2.78, Samples/sec: 0.24, Time/seq 4.15s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.19s, TFLOPs: 2.75, Samples/sec: 0.24, Time/seq 4.19s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.10s, TFLOPs: 2.82, Samples/sec: 0.24, Time/seq 4.10s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.22s, TFLOPs: 2.74, Samples/sec: 0.24, Time/seq 4.22s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.31s, TFLOPs: 2.68, Samples/sec: 0.23, Time/seq 4.31s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.27s, TFLOPs: 2.70, Samples/sec: 0.23, Time/seq 4.27s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.27s, TFLOPs: 2.70, Samples/sec: 0.23, Time/seq 4.27s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.17s, TFLOPs: 2.77, Samples/sec: 0.24, Time/seq 4.17s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.23s, TFLOPs: 2.73, Samples/sec: 0.24, Time/seq 4.23s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.14s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.14s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: [2025-07-10 16:29:00,184] [INFO] [logging.py:107:log_dist] [Rank 0] step=50, skipped=0, lr=[9.65e-07, 9.65e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +worker-1: [2025-07-10 16:29:00,784] [INFO] [timer.py:264:stop] epoch=0/micro_step=800/global_step=50, RunningAvgSamplesPerSec=5.736484660399059, CurrSamplesPerSec=5.744866945079437, MemAllocated=6.48GB, MaxMemAllocated=38.22GB +worker-1: Model Parameters: 0.596 B, Latency: 4.08s, TFLOPs: 2.83, Samples/sec: 0.25, Time/seq 4.08s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.11s, TFLOPs: 2.81, Samples/sec: 0.24, Time/seq 4.11s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.21s, TFLOPs: 2.74, Samples/sec: 0.24, Time/seq 4.21s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.21s, TFLOPs: 2.74, Samples/sec: 0.24, Time/seq 4.21s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.16s, TFLOPs: 2.78, Samples/sec: 0.24, Time/seq 4.16s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.10s, TFLOPs: 2.82, Samples/sec: 0.24, Time/seq 4.10s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.12s, TFLOPs: 2.80, Samples/sec: 0.24, Time/seq 4.12s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.05s, TFLOPs: 2.85, Samples/sec: 0.25, Time/seq 4.05s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.04s, TFLOPs: 2.86, Samples/sec: 0.25, Time/seq 4.04s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.16s, TFLOPs: 2.77, Samples/sec: 0.24, Time/seq 4.16s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: [2025-07-10 16:36:26,462] [INFO] [logging.py:107:log_dist] [Rank 0] step=60, skipped=0, lr=[1.158e-06, 1.158e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +worker-1: [2025-07-10 16:36:27,075] [INFO] [timer.py:264:stop] epoch=0/micro_step=960/global_step=60, RunningAvgSamplesPerSec=5.737017997213853, CurrSamplesPerSec=5.740447369125361, MemAllocated=6.48GB, MaxMemAllocated=38.22GB +worker-1: Model Parameters: 0.596 B, Latency: 4.13s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.13s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.18s, TFLOPs: 2.76, Samples/sec: 0.24, Time/seq 4.18s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.16s, TFLOPs: 2.78, Samples/sec: 0.24, Time/seq 4.16s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.18s, TFLOPs: 2.76, Samples/sec: 0.24, Time/seq 4.18s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.19s, TFLOPs: 2.76, Samples/sec: 0.24, Time/seq 4.19s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.23s, TFLOPs: 2.73, Samples/sec: 0.24, Time/seq 4.23s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.28s, TFLOPs: 2.69, Samples/sec: 0.23, Time/seq 4.28s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.26s, TFLOPs: 2.71, Samples/sec: 0.23, Time/seq 4.26s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.25s, TFLOPs: 2.72, Samples/sec: 0.24, Time/seq 4.25s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.21s, TFLOPs: 2.74, Samples/sec: 0.24, Time/seq 4.21s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: [2025-07-10 16:43:53,524] [INFO] [logging.py:107:log_dist] [Rank 0] step=70, skipped=0, lr=[1.3510000000000003e-06, 1.3510000000000003e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +worker-1: [2025-07-10 16:43:54,141] [INFO] [timer.py:264:stop] epoch=0/micro_step=1120/global_step=70, RunningAvgSamplesPerSec=5.735943245253326, CurrSamplesPerSec=5.740644433984722, MemAllocated=6.48GB, MaxMemAllocated=38.22GB +worker-1: Model Parameters: 0.596 B, Latency: 4.11s, TFLOPs: 2.81, Samples/sec: 0.24, Time/seq 4.11s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.14s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.14s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.20s, TFLOPs: 2.74, Samples/sec: 0.24, Time/seq 4.20s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.15s, TFLOPs: 2.78, Samples/sec: 0.24, Time/seq 4.15s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.20s, TFLOPs: 2.74, Samples/sec: 0.24, Time/seq 4.20s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.12s, TFLOPs: 2.80, Samples/sec: 0.24, Time/seq 4.12s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.22s, TFLOPs: 2.74, Samples/sec: 0.24, Time/seq 4.22s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.27s, TFLOPs: 2.70, Samples/sec: 0.23, Time/seq 4.27s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.30s, TFLOPs: 2.69, Samples/sec: 0.23, Time/seq 4.30s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.12s, TFLOPs: 2.80, Samples/sec: 0.24, Time/seq 4.12s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: [2025-07-10 16:51:20,420] [INFO] [logging.py:107:log_dist] [Rank 0] step=80, skipped=0, lr=[1.5440000000000002e-06, 1.5440000000000002e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +worker-1: [2025-07-10 16:51:21,042] [INFO] [timer.py:264:stop] epoch=0/micro_step=1280/global_step=80, RunningAvgSamplesPerSec=5.735434178463924, CurrSamplesPerSec=5.732680863340236, MemAllocated=6.48GB, MaxMemAllocated=38.22GB +worker-1: Model Parameters: 0.596 B, Latency: 4.19s, TFLOPs: 2.75, Samples/sec: 0.24, Time/seq 4.19s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.28s, TFLOPs: 2.70, Samples/sec: 0.23, Time/seq 4.28s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.18s, TFLOPs: 2.76, Samples/sec: 0.24, Time/seq 4.18s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.14s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.14s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.13s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.13s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.16s, TFLOPs: 2.77, Samples/sec: 0.24, Time/seq 4.16s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.15s, TFLOPs: 2.78, Samples/sec: 0.24, Time/seq 4.15s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.14s, TFLOPs: 2.78, Samples/sec: 0.24, Time/seq 4.14s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.12s, TFLOPs: 2.80, Samples/sec: 0.24, Time/seq 4.12s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.15s, TFLOPs: 2.78, Samples/sec: 0.24, Time/seq 4.15s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: [2025-07-10 16:58:47,028] [INFO] [logging.py:107:log_dist] [Rank 0] step=90, skipped=0, lr=[1.737e-06, 1.737e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +worker-1: [2025-07-10 16:58:47,647] [INFO] [timer.py:264:stop] epoch=0/micro_step=1440/global_step=90, RunningAvgSamplesPerSec=5.735515109884195, CurrSamplesPerSec=5.741525851748672, MemAllocated=6.48GB, MaxMemAllocated=38.22GB +worker-1: Model Parameters: 0.596 B, Latency: 4.11s, TFLOPs: 2.81, Samples/sec: 0.24, Time/seq 4.11s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.13s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.13s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.17s, TFLOPs: 2.77, Samples/sec: 0.24, Time/seq 4.17s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.11s, TFLOPs: 2.81, Samples/sec: 0.24, Time/seq 4.11s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.13s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.13s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.17s, TFLOPs: 2.76, Samples/sec: 0.24, Time/seq 4.17s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.07s, TFLOPs: 2.84, Samples/sec: 0.25, Time/seq 4.07s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.08s, TFLOPs: 2.83, Samples/sec: 0.25, Time/seq 4.08s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.17s, TFLOPs: 2.77, Samples/sec: 0.24, Time/seq 4.17s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.08s, TFLOPs: 2.83, Samples/sec: 0.25, Time/seq 4.08s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: [2025-07-10 17:06:13,258] [INFO] [logging.py:107:log_dist] [Rank 0] step=100, skipped=0, lr=[1.93e-06, 1.93e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +worker-1: [2025-07-10 17:06:13,860] [INFO] [timer.py:264:stop] epoch=0/micro_step=1600/global_step=100, RunningAvgSamplesPerSec=5.736087469552836, CurrSamplesPerSec=5.747973942907533, MemAllocated=6.48GB, MaxMemAllocated=38.22GB +worker-1: Model Parameters: 0.596 B, Latency: 4.06s, TFLOPs: 2.84, Samples/sec: 0.25, Time/seq 4.06s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.69s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.69s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.07s, TFLOPs: 2.84, Samples/sec: 0.25, Time/seq 4.07s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.13s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.13s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.04s, TFLOPs: 2.85, Samples/sec: 0.25, Time/seq 4.04s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.05s, TFLOPs: 2.85, Samples/sec: 0.25, Time/seq 4.05s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.15s, TFLOPs: 2.78, Samples/sec: 0.24, Time/seq 4.15s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.18s, TFLOPs: 2.76, Samples/sec: 0.24, Time/seq 4.18s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.23s, TFLOPs: 2.73, Samples/sec: 0.24, Time/seq 4.23s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.18s, TFLOPs: 2.76, Samples/sec: 0.24, Time/seq 4.18s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.16s, TFLOPs: 2.77, Samples/sec: 0.24, Time/seq 4.16s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: [2025-07-10 17:13:39,607] [INFO] [logging.py:107:log_dist] [Rank 0] step=110, skipped=0, lr=[2.1230000000000003e-06, 2.1230000000000003e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +worker-1: [2025-07-10 17:13:40,261] [INFO] [timer.py:264:stop] epoch=0/micro_step=1760/global_step=110, RunningAvgSamplesPerSec=5.7363274566239095, CurrSamplesPerSec=5.733757776608223, MemAllocated=6.48GB, MaxMemAllocated=38.22GB +worker-1: Model Parameters: 0.596 B, Latency: 4.18s, TFLOPs: 2.76, Samples/sec: 0.24, Time/seq 4.18s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.24s, TFLOPs: 2.72, Samples/sec: 0.24, Time/seq 4.24s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.15s, TFLOPs: 2.78, Samples/sec: 0.24, Time/seq 4.15s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.07s, TFLOPs: 2.84, Samples/sec: 0.25, Time/seq 4.07s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.07s, TFLOPs: 2.83, Samples/sec: 0.25, Time/seq 4.07s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.16s, TFLOPs: 2.77, Samples/sec: 0.24, Time/seq 4.16s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.13s, TFLOPs: 2.79, Samples/sec: 0.24, Time/seq 4.13s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 4.22s, TFLOPs: 2.73, Samples/sec: 0.24, Time/seq 4.22s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.28, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: Model Parameters: 0.596 B, Latency: 2.70s, TFLOPs: 4.27, Samples/sec: 0.37, Time/seq 2.70s, Batch Size: 1, Sequence Length: 16384 +worker-1: ***** Evaluating perplexity, Epoch 1/1 ***** +worker-1: ppl: 3.6218554973602295, loss: 1.2869864702224731 +worker-1: saving the final model ... +worker-1: wandb: +worker-1: wandb: 🚀 View run silvery-wave-50 at: https://wandb.ai/jcdata/DeepSpeedExamples-applications_DeepSpeed-Chat_training_step1_supervised_finetuning/runs/lop44kuz +worker-1: wandb: Find logs at: wandb/run-20250710_154852-lop44kuz/logs +worker-1: [2025-07-10 17:19:24,841] [INFO] [launch.py:351:main] Process 114650 exits successfully. +worker-1: [2025-07-10 17:19:24,842] [INFO] [launch.py:351:main] Process 114645 exits successfully. +worker-1: [2025-07-10 17:19:24,842] [INFO] [launch.py:351:main] Process 114651 exits successfully. +worker-1: [2025-07-10 17:19:24,842] [INFO] [launch.py:351:main] Process 114646 exits successfully. +worker-2: [2025-07-10 17:19:25,581] [INFO] [launch.py:351:main] Process 19269 exits successfully. +worker-2: [2025-07-10 17:19:25,581] [INFO] [launch.py:351:main] Process 19273 exits successfully. +worker-1: [2025-07-10 17:19:25,842] [INFO] [launch.py:351:main] Process 114649 exits successfully. +worker-1: [2025-07-10 17:19:25,843] [INFO] [launch.py:351:main] Process 114648 exits successfully. +worker-1: [2025-07-10 17:19:26,843] [INFO] [launch.py:351:main] Process 114647 exits successfully. +worker-2: [2025-07-10 17:19:27,582] [INFO] [launch.py:351:main] Process 19268 exits successfully. +worker-2: [2025-07-10 17:19:27,582] [INFO] [launch.py:351:main] Process 19271 exits successfully. +worker-2: [2025-07-10 17:19:27,582] [INFO] [launch.py:351:main] Process 19270 exits successfully. +worker-1: [2025-07-10 17:19:29,844] [INFO] [launch.py:351:main] Process 114644 exits successfully. +worker-2: [2025-07-10 17:19:33,583] [INFO] [launch.py:351:main] Process 19267 exits successfully. +worker-2: [2025-07-10 17:19:33,584] [INFO] [launch.py:351:main] Process 19272 exits successfully. +worker-2: [2025-07-10 17:19:39,585] [INFO] [launch.py:351:main] Process 19266 exits successfully.