|
+ deepspeed --master_port 16664 --module safe_rlhf.finetune \
    --train_datasets inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json \
    --model_name_or_path /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-2000 \
    --max_length 512 \
    --trust_remote_code True \
    --epochs 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 8 \
    --gradient_checkpointing \
    --learning_rate 1e-5 \
    --lr_warmup_ratio 0 \
    --weight_decay 0.0 \
    --lr_scheduler_type constant \
    --seed 42 \
    --output_dir /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-2000-Q2-2000 \
    --log_type wandb \
    --log_run_name imdb-tinyllama-3T-s3-Q1-2000-Q2-2000 \
    --log_project Inverse_Alignment_IMDb \
    --zero_stage 3 \
    --offload none \
    --bf16 True \
    --tf32 True \
    --save_16bit
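For reference, the ZeRO-3/precision flags above correspond to a DeepSpeed config along these lines. This is a minimal sketch using standard DeepSpeed keys; the exact config that safe_rlhf assembles internally may differ, and `gradient_clipping` is an assumed default rather than a value taken from the command line.

    # Sketch only: a plausible ds_config implied by the CLI flags above.
    ds_config = {
        "train_micro_batch_size_per_gpu": 1,   # --per_device_train_batch_size 1
        "gradient_accumulation_steps": 8,      # --gradient_accumulation_steps 8
        "gradient_clipping": 1.0,              # assumption: not set on the CLI
        "zero_optimization": {
            "stage": 3,                        # --zero_stage 3
            "offload_param": {"device": "none"},      # --offload none
            "offload_optimizer": {"device": "none"},
        },
        "bf16": {"enabled": True},             # --bf16 True
    }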
|
nvcc warning : incompatible redefinition for option
|
[rank0]:[W527 21:41:44.206645211 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
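All eight ranks (0-7) print this advisory: NCCL has to guess which GPU a rank owns when barrier() runs before any collective has touched a device. A minimal sketch of the two fixes the message itself suggests (LOCAL_RANK is set by the deepspeed launcher; the device_id argument needs a recent PyTorch, 2.3 or later):

    import os
    import torch
    import torch.distributed as dist

    local_rank = int(os.environ["LOCAL_RANK"])  # provided by the launcher
    torch.cuda.set_device(local_rank)

    # Fix 1: bind the process group to a device at init time.
    dist.init_process_group("nccl", device_id=torch.device(f"cuda:{local_rank}"))

    # Fix 2: tell barrier() explicitly which device this rank uses.
    dist.barrier(device_ids=[local_rank])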
|
loading configuration file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-2000/config.json

Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pad_token_id": 32000,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.52.1",
  "use_cache": true,
  "vocab_size": 32001
}
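The shape is TinyLlama-1.1B: 22 layers, hidden size 2048, 32 query heads sharing 4 KV heads, and a vocabulary grown to 32001 for a dedicated pad token. As a sketch, the same architecture can be rebuilt from these fields with stock transformers (this constructs random weights; the run itself loads the checkpoint):

    from transformers import LlamaConfig, LlamaForCausalLM

    # Mirrors the config dump above; omitted fields keep transformers defaults.
    config = LlamaConfig(
        hidden_size=2048,
        intermediate_size=5632,
        num_hidden_layers=22,
        num_attention_heads=32,
        num_key_value_heads=4,      # grouped-query attention: 8 query heads per KV head
        max_position_embeddings=2048,
        rms_norm_eps=1e-5,
        vocab_size=32001,           # 32000 + 1 extra pad token (id 32000)
        pad_token_id=32000,
        tie_word_embeddings=False,
    )
    model = LlamaForCausalLM(config)  # randomly initialized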
|
|
|
loading weights file /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-2000/pytorch_model.bin

Will use torch_dtype=torch.float32 as defined in model

Instantiating LlamaForCausalLM model under default dtype torch.float32.

Detected DeepSpeed ZeRO-3: activating zero.init() for this model

Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 32000
}

All model checkpoint weights were used when initializing LlamaForCausalLM.

All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-2000.

If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.

Generation config file not found, using a generation config created from the model config.

loading file tokenizer.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file chat_template.jinja
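Outside the trainer, the same checkpoint and tokenizer load with stock transformers calls; a sketch using the paths from this log:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    ckpt = "/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-2000"
    tokenizer = AutoTokenizer.from_pretrained(ckpt)     # the tokenizer files listed above
    model = AutoModelForCausalLM.from_pretrained(ckpt)  # pytorch_model.bin, float32 per config

    # Quick smoke test of the loaded weights.
    inputs = tokenizer("This movie was", return_tensors="pt")
    print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))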
|
Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...
|
Detected CUDA files, patching ldflags |
|
Emitting ninja build file /home/hansirui_1st/.cache/torch_extensions/py311_cu124/fused_adam/build.ninja... |
|
/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
  warnings.warn(
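The warning is harmless but makes the JIT build compile for every visible architecture. A sketch that pins TORCH_CUDA_ARCH_LIST to device 0's compute capability before any extension is built (set it in the environment, or early in the training script):

    import os
    import torch

    # Compile JIT extensions only for the local GPU's architecture.
    major, minor = torch.cuda.get_device_capability(0)
    os.environ["TORCH_CUDA_ARCH_LIST"] = f"{major}.{minor}"  # e.g. "8.0" on A100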
|
Building extension module fused_adam... |
|
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) |
|
Loading extension module fused_adam...
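fused_adam is DeepSpeed's fused CUDA Adam/AdamW kernel, JIT-compiled here on first use. A sketch of constructing it directly with the hyperparameters from the command line above (assumes `model` is already built; the trainer wires this up itself):

    from deepspeed.ops.adam import FusedAdam

    optimizer = FusedAdam(
        model.parameters(),
        lr=1e-5,           # --learning_rate 1e-5
        weight_decay=0.0,  # --weight_decay 0.0
    )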
|
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
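The warning is benign: gradient checkpointing recomputes activations in the backward pass, which makes the KV cache useless during training, so transformers switches it off. A sketch of the equivalent explicit calls:

    # What the trainer effectively does: trade compute for memory while training.
    model.gradient_checkpointing_enable()  # recompute activations in backward
    model.config.use_cache = False         # KV cache only helps at generation time

    # After training, re-enable the cache for fast generation.
    model.config.use_cache = True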
|
wandb: Currently logged in as: xtom to https://api.wandb.ai. Use `wandb login --relogin` to force relogin |
|
wandb: Tracking run with wandb version 0.19.11 |
|
wandb: Run data is saved locally in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-2000-Q2-2000/wandb/run-20250527_214202-zntqx0y1 |
|
wandb: Run `wandb offline` to turn off syncing. |
|
wandb: Syncing run imdb-tinyllama-3T-s3-Q1-2000-Q2-2000 |
|
wandb: ⭐️ View project at https://wandb.ai/xtom/Inverse_Alignment_IMDb

wandb: 🚀 View run at https://wandb.ai/xtom/Inverse_Alignment_IMDb/runs/zntqx0y1
|
Training 1/1 epoch: 0/250 [00:00<?, ?it/s]
Training 1/1 epoch (loss 2.7305): 1/250 [00:09<39:49, 9.60s/it]
Training 1/1 epoch (loss 2.7596): 2/250 [00:11<22:03, 5.34s/it]
Training 1/1 epoch (loss 2.7038): 3/250 [00:13<13:56, 3.38s/it]
Training 1/1 epoch (loss 2.5496): 4/250 [00:15<11:55, 2.91s/it]
Training 1/1 epoch (loss 2.6427): 5/250 [00:16<10:02, 2.46s/it]
Training 1/1 epoch (loss 3.0311): 6/250 [00:17<07:30, 1.85s/it]
Training 1/1 epoch (loss 2.7682): 7/250 [00:19<07:22, 1.82s/it]
Training 1/1 epoch (loss 2.6098): 8/250 [00:20<06:14, 1.55s/it]
Training 1/1 epoch (loss 2.7373): 9/250 [00:21<06:10, 1.54s/it]
Training 1/1 epoch (loss 2.6092): 10/250 [00:23<06:18, 1.58s/it]
Training 1/1 epoch (loss 2.4108): 11/250 [00:23<05:02, 1.27s/it]
Training 1/1 epoch (loss 2.5803): 12/250 [00:25<04:53, 1.23s/it]
Training 1/1 epoch (loss 2.9638): 13/250 [00:26<04:52, 1.24s/it]
Training 1/1 epoch (loss 2.6220): 14/250 [00:26<04:06, 1.05s/it]
Training 1/1 epoch (loss 2.4784): 15/250 [00:28<05:11, 1.33s/it]
Training 1/1 epoch (loss 2.5458): 16/250 [00:31<06:07, 1.57s/it]
Training 1/1 epoch (loss 2.7730): 17/250 [00:32<05:29, 1.42s/it]
Training 1/1 epoch (loss 2.5392): 18/250 [00:34<06:40, 1.72s/it]
Training 1/1 epoch (loss 2.5666): 19/250 [00:35<06:11, 1.61s/it]
Training 1/1 epoch (loss 2.7160): 20/250 [00:37<05:38, 1.47s/it]
Training 1/1 epoch (loss 2.6990): 21/250 [00:39<06:32, 1.71s/it]
Training 1/1 epoch (loss 2.8967): 22/250 [00:40<06:03, 1.59s/it]
Training 1/1 epoch (loss 2.7035): 23/250 [00:42<05:50, 1.55s/it]
Training 1/1 epoch (loss 2.8607): 24/250 [00:44<06:51, 1.82s/it]
Training 1/1 epoch (loss 2.6341): 25/250 [00:45<05:49, 1.55s/it]
Training 1/1 epoch (loss 2.6676): 26/250 [00:47<06:11, 1.66s/it]
Training 1/1 epoch (loss 2.4768): 27/250 [00:48<05:52, 1.58s/it]
Training 1/1 epoch (loss 2.6010): 28/250 [00:49<05:13, 1.41s/it]
Training 1/1 epoch (loss 2.6398): 29/250 [00:51<05:30, 1.50s/it]
Training 1/1 epoch (loss 2.8144): 30/250 [00:52<05:20, 1.46s/it]
Training 1/1 epoch (loss 2.6594): 31/250 [00:54<05:18, 1.45s/it]
Training 1/1 epoch (loss 2.6846): 32/250 [00:56<05:50, 1.61s/it]
Training 1/1 epoch (loss 2.8049): 33/250 [00:57<05:13, 1.44s/it]
Training 1/1 epoch (loss 2.5130): 34/250 [00:59<06:15, 1.74s/it]
Training 1/1 epoch (loss 2.4097): 35/250 [01:01<05:55, 1.65s/it]
Training 1/1 epoch (loss 2.8626): 36/250 [01:01<04:50, 1.36s/it]
Training 1/1 epoch (loss 2.5627): 37/250 [01:03<05:22, 1.51s/it]
Training 1/1 epoch (loss 2.6084): 38/250 [01:05<05:38, 1.60s/it]
Training 1/1 epoch (loss 2.6832): 39/250 [01:06<04:41, 1.33s/it]
Training 1/1 epoch (loss 2.3384): 40/250 [01:08<06:04, 1.73s/it]
Training 1/1 epoch (loss 2.6444): 41/250 [01:11<06:25, 1.85s/it]
Training 1/1 epoch (loss 2.5992): 42/250 [01:11<05:13, 1.51s/it]
Training 1/1 epoch (loss 2.4817): 43/250 [01:13<05:41, 1.65s/it]
Training 1/1 epoch (loss 2.5381): 44/250 [01:15<05:32, 1.61s/it]
Training 1/1 epoch (loss 2.6896): 45/250 [01:16<05:09, 1.51s/it]
Training 1/1 epoch (loss 2.7406): 46/250 [01:18<05:28, 1.61s/it]
Training 1/1 epoch (loss 2.7604): 47/250 [01:19<04:47, 1.42s/it]
Training 1/1 epoch (loss 2.7458): 48/250 [01:21<05:47, 1.72s/it]
Training 1/1 epoch (loss 2.5897): 49/250 [01:22<05:12, 1.55s/it]
Training 1/1 epoch (loss 2.6346): 50/250 [01:23<04:32, 1.36s/it]
Training 1/1 epoch (loss 2.7322): 51/250 [01:26<05:19, 1.61s/it]
Training 1/1 epoch (loss 2.6705): 52/250 [01:27<05:21, 1.63s/it]
Training 1/1 epoch (loss 2.5635): 53/250 [01:28<04:32, 1.38s/it]
Training 1/1 epoch (loss 2.7591): 54/250 [01:30<05:29, 1.68s/it]
Training 1/1 epoch (loss 2.4744): 55/250 [01:32<05:43, 1.76s/it]
Training 1/1 epoch (loss 2.4478): 56/250 [01:34<05:05, 1.57s/it]
Training 1/1 epoch (loss 2.5971): 57/250 [01:36<05:36, 1.74s/it]
Training 1/1 epoch (loss 2.8768): 58/250 [01:37<04:58, 1.56s/it]
Training 1/1 epoch (loss 2.8798): 59/250 [01:38<04:15, 1.34s/it]
Training 1/1 epoch (loss 2.5492): 60/250 [01:39<04:28, 1.41s/it]
Training 1/1 epoch (loss 2.5418): 61/250 [01:41<05:11, 1.65s/it]
Training 1/1 epoch (loss 2.7034): 62/250 [01:42<04:03, 1.30s/it]
Training 1/1 epoch (loss 2.7081): 63/250 [01:43<04:20, 1.39s/it]
Training 1/1 epoch (loss 2.5340): 64/250 [01:45<04:32, 1.47s/it]
Training 1/1 epoch (loss 2.6625): 65/250 [01:46<03:37, 1.18s/it]
Training 1/1 epoch (loss 2.6378): 66/250 [01:47<03:54, 1.28s/it]
Training 1/1 epoch (loss 2.6309): 67/250 [01:49<04:24, 1.45s/it]
Training 1/1 epoch (loss 2.4773): 68/250 [01:50<04:16, 1.41s/it]
Training 1/1 epoch (loss 2.4859): 69/250 [01:52<04:28, 1.48s/it]
Training 1/1 epoch (loss 2.8005): 70/250 [01:53<03:54, 1.31s/it]
Training 1/1 epoch (loss 2.5381): 71/250 [01:54<04:00, 1.35s/it]
Training 1/1 epoch (loss 2.6825): 72/250 [01:56<04:33, 1.53s/it]
Training 1/1 epoch (loss 2.4287): 73/250 [01:57<03:56, 1.34s/it]
Training 1/1 epoch (loss 2.4978): 74/250 [01:59<04:32, 1.55s/it]
Training 1/1 epoch (loss 2.7592): 75/250 [02:01<04:47, 1.64s/it]
Training 1/1 epoch (loss 2.4750): 76/250 [02:02<04:09, 1.43s/it]
Training 1/1 epoch (loss 2.4154): 77/250 [02:04<04:58, 1.73s/it]
Training 1/1 epoch (loss 2.5425): 78/250 [02:06<05:01, 1.75s/it]
Training 1/1 epoch (loss 2.6443): 79/250 [02:07<04:34, 1.60s/it]
Training 1/1 epoch (loss 2.5997): 80/250 [02:09<04:39, 1.64s/it]
Training 1/1 epoch (loss 2.7390): 81/250 [02:11<04:30, 1.60s/it]
Training 1/1 epoch (loss 2.7965): 82/250 [02:13<05:10, 1.85s/it]
Training 1/1 epoch (loss 2.6808): 83/250 [02:15<05:11, 1.87s/it]
Training 1/1 epoch (loss 2.5373): 84/250 [02:15<04:00, 1.45s/it]
Training 1/1 epoch (loss 2.7656): 85/250 [02:17<04:19, 1.57s/it]
Training 1/1 epoch (loss 2.6217): 86/250 [02:19<04:19, 1.59s/it]
Training 1/1 epoch (loss 2.6395): 87/250 [02:20<03:29, 1.29s/it]
Training 1/1 epoch (loss 2.5972): 88/250 [02:22<04:46, 1.77s/it]
Training 1/1 epoch (loss 2.6683): 89/250 [02:24<04:48, 1.79s/it]
Training 1/1 epoch (loss 2.7665): 90/250 [02:25<04:02, 1.51s/it]
Training 1/1 epoch (loss 2.7083): 91/250 [02:27<04:32, 1.71s/it]
Training 1/1 epoch (loss 2.5036): 92/250 [02:29<04:18, 1.64s/it]
Training 1/1 epoch (loss 2.3626): 93/250 [02:30<03:57, 1.51s/it]
Training 1/1 epoch (loss 2.6957): 94/250 [02:32<04:09, 1.60s/it]
Training 1/1 epoch (loss 2.7236): 95/250 [02:33<04:09, 1.61s/it]
Training 1/1 epoch (loss 2.5953): 96/250 [02:35<04:09, 1.62s/it]
Training 1/1 epoch (loss 2.6531): 97/250 [02:37<04:39, 1.82s/it]
Training 1/1 epoch (loss 2.6405): 98/250 [02:38<03:52, 1.53s/it]
Training 1/1 epoch (loss 2.6494): 99/250 [02:40<04:13, 1.68s/it]
Training 1/1 epoch (loss 2.6768): 100/250 [02:43<04:44, 1.90s/it]
Training 1/1 epoch (loss 2.6081): 101/250 [02:43<03:40, 1.48s/it]
Training 1/1 epoch (loss 2.6653): 102/250 [02:45<03:56, 1.60s/it]
Training 1/1 epoch (loss 2.7290): 103/250 [02:47<04:06, 1.68s/it]
Training 1/1 epoch (loss 2.5855): 104/250 [02:48<03:21, 1.38s/it]
Training 1/1 epoch (loss 2.6622): 105/250 [02:50<03:43, 1.54s/it]
Training 1/1 epoch (loss 2.3912): 106/250 [02:51<03:47, 1.58s/it]
Training 1/1 epoch (loss 2.7620): 107/250 [02:52<03:17, 1.38s/it]
Training 1/1 epoch (loss 2.8349): 108/250 [02:55<04:05, 1.73s/it]
Training 1/1 epoch (loss 2.5155): 109/250 [02:56<04:06, 1.75s/it]
Training 1/1 epoch (loss 2.6244): 110/250 [02:57<03:31, 1.51s/it]
Training 1/1 epoch (loss 2.6141): 111/250 [02:59<03:33, 1.54s/it]
Training 1/1 epoch (loss 2.6967): 112/250 [03:01<03:32, 1.54s/it]
Training 1/1 epoch (loss 2.7176): 113/250 [03:02<03:24, 1.49s/it]
Training 1/1 epoch (loss 2.7474): 114/250 [03:03<03:23, 1.49s/it]
Training 1/1 epoch (loss 2.5070): 115/250 [03:04<02:52, 1.28s/it]
Training 1/1 epoch (loss 2.6572): 116/250 [03:06<03:24, 1.53s/it]
Training 1/1 epoch (loss 2.7441): 117/250 [03:08<03:22, 1.52s/it]
Training 1/1 epoch (loss 2.5823): 118/250 [03:09<02:52, 1.31s/it]
Training 1/1 epoch (loss 2.6220): 119/250 [03:11<03:39, 1.68s/it]
Training 1/1 epoch (loss 2.6472): 120/250 [03:14<04:04, 1.88s/it]
Training 1/1 epoch (loss 2.6828): 121/250 [03:14<03:17, 1.53s/it]
Training 1/1 epoch (loss 2.5219): 122/250 [03:16<03:35, 1.68s/it]
Training 1/1 epoch (loss 2.6395): 123/250 [03:18<03:21, 1.59s/it]
Training 1/1 epoch (loss 2.8004): 124/250 [03:19<02:56, 1.40s/it]
Training 1/1 epoch (loss 2.6163): 125/250 [03:21<03:33, 1.71s/it]
Training 1/1 epoch (loss 2.8392): 126/250 [03:22<03:15, 1.58s/it]
Training 1/1 epoch (loss 2.6884): 127/250 [03:23<02:43, 1.33s/it]
Training 1/1 epoch (loss 2.8049): 128/250 [03:24<02:40, 1.32s/it]
Training 1/1 epoch (loss 2.5552): 129/250 [03:26<02:45, 1.37s/it]
Training 1/1 epoch (loss 2.5443): 130/250 [03:28<03:21, 1.68s/it]
Training 1/1 epoch (loss 2.6326): 131/250 [03:30<03:34, 1.80s/it]
Training 1/1 epoch (loss 2.4792): 132/250 [03:31<02:56, 1.49s/it]
Training 1/1 epoch (loss 2.5971): 133/250 [03:32<02:26, 1.26s/it]
Training 1/1 epoch (loss 2.7068): 134/250 [03:33<02:30, 1.30s/it]
Training 1/1 epoch (loss 2.7084): 135/250 [03:34<02:16, 1.18s/it]
Training 1/1 epoch (loss 2.8996): 136/250 [03:36<02:26, 1.29s/it]
Training 1/1 epoch (loss 2.6853): 137/250 [03:36<02:11, 1.16s/it]
Training 1/1 epoch (loss 2.6580): 138/250 [03:37<02:03, 1.10s/it]
Training 1/1 epoch (loss 2.5564): 139/250 [03:39<02:14, 1.21s/it]
Training 1/1 epoch (loss 2.9477): 140/250 [03:40<02:18, 1.26s/it]
Training 1/1 epoch (loss 2.4639): 141/250 [03:41<01:53, 1.05s/it]
Training 1/1 epoch (loss 2.6931): 142/250 [03:42<02:08, 1.19s/it]
Training 1/1 epoch (loss 2.6682): 143/250 [03:44<02:26, 1.37s/it]
Training 1/1 epoch (loss 2.6301): 144/250 [03:45<02:03, 1.16s/it]
Training 1/1 epoch (loss 2.5366): 145/250 [03:47<02:29, 1.43s/it]
Training 1/1 epoch (loss 2.6754): 146/250 [03:48<02:19, 1.34s/it]
Training 1/1 epoch (loss 2.7084): 147/250 [03:49<02:10, 1.27s/it]
Training 1/1 epoch (loss 2.7715): 148/250 [03:51<02:28, 1.46s/it]
Training 1/1 epoch (loss 2.7104): 149/250 [03:52<02:16, 1.36s/it]
Training 1/1 epoch (loss 2.7454): 150/250 [03:53<02:12, 1.33s/it]
Training 1/1 epoch (loss 2.4527): 151/250 [03:55<02:31, 1.53s/it]
Training 1/1 epoch (loss 2.7257): 152/250 [03:57<02:18, 1.41s/it]
Training 1/1 epoch (loss 2.5075): 153/250 [03:58<02:09, 1.33s/it]
Training 1/1 epoch (loss 2.6387): 154/250 [04:00<02:26, 1.53s/it]
Training 1/1 epoch (loss 2.7369): 155/250 [04:02<02:38, 1.67s/it]
Training 1/1 epoch (loss 2.7753): 156/250 [04:03<02:21, 1.51s/it]
Training 1/1 epoch (loss 2.5635): 157/250 [04:04<02:23, 1.54s/it]
Training 1/1 epoch (loss 2.5672): 158/250 [04:06<02:09, 1.41s/it]
Training 1/1 epoch (loss 2.5791): 159/250 [04:06<01:49, 1.20s/it]
Training 1/1 epoch (loss 2.4574): 160/250 [04:07<01:44, 1.16s/it]
Training 1/1 epoch (loss 2.6573): 161/250 [04:09<01:53, 1.27s/it]
Training 1/1 epoch (loss 2.5136): 162/250 [04:09<01:34, 1.08s/it]
Training 1/1 epoch (loss 2.5283): 163/250 [04:12<01:59, 1.37s/it]
Training 1/1 epoch (loss 2.5575): 164/250 [04:12<01:47, 1.25s/it]
Training 1/1 epoch (loss 2.4465): 165/250 [04:14<01:53, 1.33s/it]
Training 1/1 epoch (loss 2.3311): 166/250 [04:16<02:20, 1.68s/it]
Training 1/1 epoch (loss 2.7088): 167/250 [04:17<01:50, 1.33s/it]
Training 1/1 epoch (loss 2.8158): 168/250 [04:19<02:12, 1.62s/it]
Training 1/1 epoch (loss 2.6012): 169/250 [04:22<02:31, 1.87s/it]
Training 1/1 epoch (loss 2.7014): 170/250 [04:22<01:54, 1.43s/it]
Training 1/1 epoch (loss 2.6927): 171/250 [04:24<02:08, 1.63s/it]
Training 1/1 epoch (loss 2.4914): 172/250 [04:26<01:59, 1.53s/it]
Training 1/1 epoch (loss 2.5766): 173/250 [04:26<01:34, 1.22s/it]
Training 1/1 epoch (loss 2.7001): 174/250 [04:28<01:42, 1.35s/it]
Training 1/1 epoch (loss 2.7079): 175/250 [04:30<02:04, 1.67s/it]
Training 1/1 epoch (loss 2.6987): 176/250 [04:31<01:41, 1.38s/it]
Training 1/1 epoch (loss 2.7118): 177/250 [04:33<01:51, 1.53s/it]
Training 1/1 epoch (loss 2.8273): 178/250 [04:35<02:00, 1.67s/it]
Training 1/1 epoch (loss 2.7618): 179/250 [04:35<01:35, 1.35s/it]
Training 1/1 epoch (loss 2.4809): 180/250 [04:38<01:54, 1.63s/it]
Training 1/1 epoch (loss 2.5991): 181/250 [04:40<01:59, 1.73s/it]
Training 1/1 epoch (loss 2.6308): 182/250 [04:41<01:47, 1.58s/it]
Training 1/1 epoch (loss 2.4013): 183/250 [04:43<02:02, 1.82s/it]
Training 1/1 epoch (loss 2.6239): 184/250 [04:44<01:45, 1.60s/it]
Training 1/1 epoch (loss 2.7276): 185/250 [04:45<01:30, 1.39s/it]
Training 1/1 epoch (loss 2.7126): 186/250 [04:46<01:23, 1.30s/it]
Training 1/1 epoch (loss 2.6814): 187/250 [04:48<01:28, 1.41s/it]
Training 1/1 epoch (loss 2.7946): 188/250 [04:49<01:22, 1.32s/it]
Training 1/1 epoch (loss 2.4917): 189/250 [04:51<01:28, 1.44s/it]
Training 1/1 epoch (loss 2.6280): 190/250 [04:52<01:21, 1.35s/it]
Training 1/1 epoch (loss 2.7416): 191/250 [04:54<01:36, 1.64s/it]
Training 1/1 epoch (loss 2.6668): 192/250 [04:57<01:48, 1.87s/it]
Training 1/1 epoch (loss 2.4909): 193/250 [04:57<01:23, 1.46s/it]
Training 1/1 epoch (loss 2.5730): 194/250 [04:59<01:35, 1.70s/it]
Training 1/1 epoch (loss 2.6334): 195/250 [05:01<01:40, 1.82s/it]
Training 1/1 epoch (loss 2.5094): 196/250 [05:02<01:17, 1.44s/it]
Training 1/1 epoch (loss 2.5679): 197/250 [05:04<01:25, 1.62s/it]
Training 1/1 epoch (loss 2.5704): 198/250 [05:06<01:23, 1.61s/it]
Training 1/1 epoch (loss 2.5891): 199/250 [05:06<01:09, 1.36s/it]
Training 1/1 epoch (loss 2.5742): 200/250 [05:08<01:17, 1.56s/it]
Training 1/1 epoch (loss 2.4963): 201/250 [05:10<01:17, 1.57s/it]
Training 1/1 epoch (loss 2.6719): 202/250 [05:11<01:08, 1.42s/it]
Training 1/1 epoch (loss 2.5895): 203/250 [05:12<01:05, 1.39s/it]
Training 1/1 epoch (loss 2.6855): 204/250 [05:14<01:08, 1.49s/it]
Training 1/1 epoch (loss 2.5880): 205/250 [05:15<01:04, 1.44s/it]
Training 1/1 epoch (loss 2.8415): 206/250 [05:17<01:07, 1.53s/it]
Training 1/1 epoch (loss 2.7065): 207/250 [05:18<00:57, 1.33s/it]
Training 1/1 epoch (loss 2.5463): 208/250 [05:20<01:09, 1.65s/it]
Training 1/1 epoch (loss 2.6945): 209/250 [05:23<01:17, 1.89s/it]
Training 1/1 epoch (loss 2.5531): 210/250 [05:23<00:58, 1.45s/it]
Training 1/1 epoch (loss 2.6629): 211/250 [05:24<00:52, 1.36s/it]
Training 1/1 epoch (loss 2.7072): 212/250 [05:26<00:55, 1.46s/it]
Training 1/1 epoch (loss 2.5986): 213/250 [05:27<00:45, 1.23s/it]
Training 1/1 epoch (loss 2.7702): 214/250 [05:29<00:57, 1.59s/it]
Training 1/1 epoch (loss 2.6969): 214/250 [05:31<00:57, 1.59s/it]
Training 1/1 epoch (loss 2.6969): 86%|βββββββββ | 215/250 [05:31<01:01, 1.76s/it]
Training 1/1 epoch (loss 2.6419): 86%|βββββββββ | 215/250 [05:32<01:01, 1.76s/it]
Training 1/1 epoch (loss 2.6419): 86%|βββββββββ | 216/250 [05:32<00:49, 1.45s/it]
Training 1/1 epoch (loss 2.7658): 86%|βββββββββ | 216/250 [05:35<00:49, 1.45s/it]
Training 1/1 epoch (loss 2.7658): 87%|βββββββββ | 217/250 [05:35<00:57, 1.76s/it]
Training 1/1 epoch (loss 2.6563): 87%|βββββββββ | 217/250 [05:36<00:57, 1.76s/it]
Training 1/1 epoch (loss 2.6563): 87%|βββββββββ | 218/250 [05:36<00:53, 1.67s/it]
Training 1/1 epoch (loss 2.8032): 87%|βββββββββ | 218/250 [05:37<00:53, 1.67s/it]
Training 1/1 epoch (loss 2.8032): 88%|βββββββββ | 219/250 [05:37<00:47, 1.53s/it]
Training 1/1 epoch (loss 2.7998): 88%|βββββββββ | 219/250 [05:39<00:47, 1.53s/it]
Training 1/1 epoch (loss 2.7998): 88%|βββββββββ | 220/250 [05:39<00:45, 1.51s/it]
Training 1/1 epoch (loss 2.6268): 88%|βββββββββ | 220/250 [05:40<00:45, 1.51s/it]
Training 1/1 epoch (loss 2.6268): 88%|βββββββββ | 221/250 [05:40<00:41, 1.42s/it]
Training 1/1 epoch (loss 2.5846): 88%|βββββββββ | 221/250 [05:41<00:41, 1.42s/it]
Training 1/1 epoch (loss 2.5846): 89%|βββββββββ | 222/250 [05:41<00:33, 1.19s/it]
Training 1/1 epoch (loss 2.5619): 89%|βββββββββ | 222/250 [05:43<00:33, 1.19s/it]
Training 1/1 epoch (loss 2.5619): 89%|βββββββββ | 223/250 [05:43<00:42, 1.56s/it]
Training 1/1 epoch (loss 2.6440): 89%|βββββββββ | 223/250 [05:45<00:42, 1.56s/it]
Training 1/1 epoch (loss 2.6440): 90%|βββββββββ | 224/250 [05:45<00:40, 1.55s/it]
Training 1/1 epoch (loss 2.6184): 90%|βββββββββ | 224/250 [05:45<00:40, 1.55s/it]
Training 1/1 epoch (loss 2.6184): 90%|βββββββββ | 225/250 [05:45<00:33, 1.33s/it]
Training 1/1 epoch (loss 2.5425): 90%|βββββββββ | 225/250 [05:47<00:33, 1.33s/it]
Training 1/1 epoch (loss 2.5425): 90%|βββββββββ | 226/250 [05:47<00:34, 1.44s/it]
Training 1/1 epoch (loss 2.2281): 90%|βββββββββ | 226/250 [05:48<00:34, 1.44s/it]
Training 1/1 epoch (loss 2.2281): 91%|βββββββββ | 227/250 [05:48<00:28, 1.25s/it]
Training 1/1 epoch (loss 2.7459): 91%|βββββββββ | 227/250 [05:49<00:28, 1.25s/it]
Training 1/1 epoch (loss 2.7459): 91%|βββββββββ | 228/250 [05:49<00:26, 1.22s/it]
Training 1/1 epoch (loss 2.4583): 91%|βββββββββ | 228/250 [05:51<00:26, 1.22s/it]
Training 1/1 epoch (loss 2.4583): 92%|ββββββββββ| 229/250 [05:51<00:32, 1.53s/it]
Training 1/1 epoch (loss 2.8023): 92%|ββββββββββ| 229/250 [05:53<00:32, 1.53s/it]
Training 1/1 epoch (loss 2.8023): 92%|ββββββββββ| 230/250 [05:53<00:29, 1.46s/it]
Training 1/1 epoch (loss 2.7741): 92%|ββββββββββ| 230/250 [05:55<00:29, 1.46s/it]
Training 1/1 epoch (loss 2.7741): 92%|ββββββββββ| 231/250 [05:55<00:33, 1.74s/it]
Training 1/1 epoch (loss 2.5439): 92%|ββββββββββ| 231/250 [05:57<00:33, 1.74s/it]
Training 1/1 epoch (loss 2.5439): 93%|ββββββββββ| 232/250 [05:57<00:31, 1.78s/it]
Training 1/1 epoch (loss 2.5555): 93%|ββββββββββ| 232/250 [05:57<00:31, 1.78s/it]
Training 1/1 epoch (loss 2.5555): 93%|ββββββββββ| 233/250 [05:57<00:23, 1.40s/it]
Training 1/1 epoch (loss 2.6544): 93%|ββββββββββ| 233/250 [05:58<00:23, 1.40s/it]
Training 1/1 epoch (loss 2.6544): 94%|ββββββββββ| 234/250 [05:58<00:20, 1.29s/it]
Training 1/1 epoch (loss 2.4306): 94%|ββββββββββ| 234/250 [06:01<00:20, 1.29s/it]
Training 1/1 epoch (loss 2.4306): 94%|ββββββββββ| 235/250 [06:01<00:23, 1.59s/it]
Training 1/1 epoch (loss 2.5172): 94%|ββββββββββ| 235/250 [06:01<00:23, 1.59s/it]
Training 1/1 epoch (loss 2.5172): 94%|ββββββββββ| 236/250 [06:01<00:18, 1.33s/it]
Training 1/1 epoch (loss 2.5500): 94%|ββββββββββ| 236/250 [06:04<00:18, 1.33s/it]
Training 1/1 epoch (loss 2.5500): 95%|ββββββββββ| 237/250 [06:04<00:21, 1.62s/it]
Training 1/1 epoch (loss 2.5297): 95%|ββββββββββ| 237/250 [06:06<00:21, 1.62s/it]
Training 1/1 epoch (loss 2.5297): 95%|ββββββββββ| 238/250 [06:06<00:20, 1.69s/it]
Training 1/1 epoch (loss 2.5566): 95%|ββββββββββ| 238/250 [06:07<00:20, 1.69s/it]
Training 1/1 epoch (loss 2.5566): 96%|ββββββββββ| 239/250 [06:07<00:16, 1.54s/it]
Training 1/1 epoch (loss 2.4001): 96%|ββββββββββ| 239/250 [06:09<00:16, 1.54s/it]
Training 1/1 epoch (loss 2.4001): 96%|ββββββββββ| 240/250 [06:09<00:17, 1.74s/it]
Training 1/1 epoch (loss 2.6095): 96%|ββββββββββ| 240/250 [06:11<00:17, 1.74s/it]
Training 1/1 epoch (loss 2.6095): 96%|ββββββββββ| 241/250 [06:11<00:15, 1.67s/it]
Training 1/1 epoch (loss 2.5228): 96%|ββββββββββ| 241/250 [06:13<00:15, 1.67s/it]
Training 1/1 epoch (loss 2.5228): 97%|ββββββββββ| 242/250 [06:13<00:15, 1.90s/it]
Training 1/1 epoch (loss 2.6822): 97%|ββββββββββ| 242/250 [06:15<00:15, 1.90s/it]
Training 1/1 epoch (loss 2.6822): 97%|ββββββββββ| 243/250 [06:15<00:13, 1.90s/it]
Training 1/1 epoch (loss 2.6131): 97%|ββββββββββ| 243/250 [06:15<00:13, 1.90s/it]
Training 1/1 epoch (loss 2.6131): 98%|ββββββββββ| 244/250 [06:15<00:08, 1.48s/it]
Training 1/1 epoch (loss 2.6554): 98%|ββββββββββ| 244/250 [06:17<00:08, 1.48s/it]
Training 1/1 epoch (loss 2.6554): 98%|ββββββββββ| 245/250 [06:17<00:08, 1.63s/it]
Training 1/1 epoch (loss 2.6773): 98%|ββββββββββ| 245/250 [06:20<00:08, 1.63s/it]
Training 1/1 epoch (loss 2.6773): 98%|ββββββββββ| 246/250 [06:20<00:07, 1.85s/it]
Training 1/1 epoch (loss 2.5804): 98%|ββββββββββ| 246/250 [06:20<00:07, 1.85s/it]
Training 1/1 epoch (loss 2.5804): 99%|ββββββββββ| 247/250 [06:20<00:04, 1.51s/it]
Training 1/1 epoch (loss 2.7035): 99%|ββββββββββ| 247/250 [06:22<00:04, 1.51s/it]
Training 1/1 epoch (loss 2.7035): 99%|ββββββββββ| 248/250 [06:22<00:03, 1.62s/it]
Training 1/1 epoch (loss 2.5182): 99%|ββββββββββ| 248/250 [06:24<00:03, 1.62s/it]
Training 1/1 epoch (loss 2.5182): 100%|ββββββββββ| 249/250 [06:24<00:01, 1.58s/it]
Training 1/1 epoch (loss 2.6589): 100%|ββββββββββ| 249/250 [06:24<00:01, 1.58s/it]
Training 1/1 epoch (loss 2.6589): 100%|ββββββββββ| 250/250 [06:24<00:00, 1.26s/it]
Training 1/1 epoch (loss 2.6589): 100%|ββββββββββ| 250/250 [06:24<00:00, 1.54s/it] |
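The 250-step bar length is consistent with the run's data and accumulation settings. A minimal sketch of the arithmetic, assuming 2000 training examples, per-device batch size 1, and 8 gradient-accumulation steps (values read off this run's configuration, with the bar counting per-rank optimizer updates):

import math

# Sketch only: reconstructs why the tqdm bar runs to 250. The sizes below
# are assumptions taken from this run's setup, not queried at runtime.
num_examples = 2000      # examples in train.json
per_device_batch = 1     # micro-batch per GPU
grad_accum = 8           # gradient-accumulation steps

# One optimizer update consumes per_device_batch * grad_accum examples per rank.
steps_per_epoch = math.ceil(num_examples / (per_device_batch * grad_accum))
print(steps_per_epoch)   # -> 250, matching the 250/250 bar above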
tokenizer config file saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-2000-Q2-2000/tokenizer_config.json
Special tokens file saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-2000-Q2-2000/special_tokens_map.json
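With the tokenizer files and 16-bit weights written to the output directory, the checkpoint can be reloaded for evaluation. A minimal sketch, assuming the standard Hugging Face transformers API and that the directory above contains a complete save:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Sketch: reload the checkpoint this run just saved. The path comes from the
# save messages above; bfloat16 matches the run's 16-bit save format.
ckpt = "/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-2000-Q2-2000"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.bfloat16)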
wandb: ERROR Problem finishing run
Exception ignored in atexit callback: <bound method rank_zero_only.<locals>.wrapper of <safe_rlhf.logger.Logger object at 0x1551f11f2210>>
Traceback (most recent call last):
  File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/utils.py", line 212, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/logger.py", line 183, in close
    self.wandb.finish()
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 503, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 451, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2309, in finish
    return self._finish(exit_code)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2337, in _finish
    self._atexit_cleanup(exit_code=exit_code)
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2550, in _atexit_cleanup
    self._on_finish()
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2806, in _on_finish
    wait_with_progress(
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress
    return wait_all_with_progress(
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress
    return asyncio_compat.run(progress_loop_with_timeout)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_compat.py", line 27, in run
    future = executor.submit(runner.run, fn)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/concurrent/futures/thread.py", line 169, in submit
    raise RuntimeError(
RuntimeError: cannot schedule new futures after interpreter shutdown
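The RuntimeError is a shutdown-ordering problem, not a training failure: the Logger's close() runs as an atexit callback, and by that point Python's concurrent.futures machinery is already shutting down, so wandb's mailbox wait cannot submit the thread-pool future it needs to flush the run. A minimal sketch of a workaround, assuming the caller can reach the run object before the process exits (train below is a hypothetical stand-in for the real training loop, not this codebase's API):

import wandb

def train(run: "wandb.sdk.wandb_run.Run") -> None:
    # Hypothetical stand-in for the real training loop.
    run.log({"loss": 2.6589})

def main() -> None:
    run = wandb.init(project="Inverse_Alignment_IMDb")  # project name from this run
    try:
        train(run)
    finally:
        # Finish the run explicitly while the interpreter is still fully
        # alive; relying on atexit leaves wandb trying to schedule upload
        # futures after the thread pool has started refusing new work.
        run.finish()

if __name__ == "__main__":
    main()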