diff --git "a/train.log" "b/train.log" new file mode 100644--- /dev/null +++ "b/train.log" @@ -0,0 +1,2093 @@ +W0530 17:08:42.103000 10031 site-packages/torch/distributed/run.py:792] +W0530 17:08:42.103000 10031 site-packages/torch/distributed/run.py:792] ***************************************** +W0530 17:08:42.103000 10031 site-packages/torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0530 17:08:42.103000 10031 site-packages/torch/distributed/run.py:792] ***************************************** +Trainer._get_train_sampler replaced with custom implementation.Trainer._get_train_sampler replaced with custom implementation.Trainer._get_train_sampler replaced with custom implementation.Trainer._get_train_sampler replaced with custom implementation.Trainer._get_train_sampler replaced with custom implementation.Trainer._get_train_sampler replaced with custom implementation.Trainer._get_train_sampler replaced with custom implementation. +Trainer._get_train_sampler replaced with custom implementation. + + + + + + +[2025-05-30 17:08:51,009] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-30 17:08:51,044] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-30 17:08:51,046] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-30 17:08:51,049] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-30 17:08:51,052] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-30 17:08:51,056] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-30 17:08:51,069] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-30 17:08:51,071] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-30 17:08:51,857] [INFO] [comm.py:658:init_distributed] cdb=None +[2025-05-30 17:08:51,857] [INFO] [comm.py:658:init_distributed] cdb=None +[2025-05-30 17:08:51,857] [INFO] [comm.py:658:init_distributed] cdb=None +[2025-05-30 17:08:51,857] [INFO] [comm.py:658:init_distributed] cdb=None +[2025-05-30 17:08:51,857] [INFO] [comm.py:658:init_distributed] cdb=None +[2025-05-30 17:08:51,857] [INFO] [comm.py:658:init_distributed] cdb=None +[2025-05-30 17:08:51,857] [INFO] [comm.py:658:init_distributed] cdb=None +[2025-05-30 17:08:51,857] [INFO] [comm.py:658:init_distributed] cdb=None +[2025-05-30 17:08:51,857] [INFO] [comm.py:689:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. + Loading checkpoint shards: 0%| | 0/2 [00:00