Upload folder using huggingface_hub
- .gitattributes +1 -0
- arguments.yaml +94 -0
- config.json +0 -0
- environ.txt +62 -0
- preprocessor_config.json +28 -0
- processor_config.json +5 -0
- pytorch_model.bin +3 -0
- script.sh +48 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +0 -0
- wandb/debug-internal.log +20 -0
- wandb/debug.log +33 -0
- wandb/run-20241023_090557-paei5sn7/files/config.yaml +143 -0
- wandb/run-20241023_090557-paei5sn7/files/output.log +261 -0
- wandb/run-20241023_090557-paei5sn7/files/requirements.txt +230 -0
- wandb/run-20241023_090557-paei5sn7/files/wandb-metadata.json +102 -0
- wandb/run-20241023_090557-paei5sn7/files/wandb-summary.json +1 -0
- wandb/run-20241023_090557-paei5sn7/logs/debug-internal.log +20 -0
- wandb/run-20241023_090557-paei5sn7/logs/debug.log +33 -0
- wandb/run-20241023_090557-paei5sn7/run-paei5sn7.wandb +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241023_090557-paei5sn7/run-paei5sn7.wandb filter=lfs diff=lfs merge=lfs -text
arguments.yaml
ADDED
@@ -0,0 +1,94 @@
bnb_cfgs:
  bnb_4bit_compute_dtype: float16
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: true
  load_in_4bit: true
  load_in_8bit: false
  use_bnb: false
data_cfgs:
  eval_data_files: null
  eval_datasets: null
  eval_optional_args: []
  eval_size: null
  eval_split: null
  eval_subset: null
  eval_template: null
  ptx_data_files: null
  ptx_datasets: null
  ptx_optional_args: []
  ptx_size: null
  ptx_split: null
  ptx_subset: null
  ptx_template: null
  train_data_files: ti2ti_llf_prompt_only_tokenize.pt
  train_datasets: /data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs
  train_optional_args: []
  train_size: 5000
  train_split: null
  train_subset: null
  train_template: spavl_ti2ti
logger_cfgs:
  cache_dir: null
  log_project: align-anything
  log_run_name: ppo
  log_type: wandb
  output_dir: ../outputs/ppo_ti2ti_llf_1023_step_800
  save_interval: 30.0
lora_cfgs:
  inference_mode: false
  lora_alpha: 16
  lora_dropout: 0.1
  r: 16
  save_full_model: true
  target_modules:
  - q_proj
  - v_proj
  task_type: TaskType.CAUSAL_LM
  use_lora: false
model_cfgs:
  actor_model_name_or_path: /data/align-anything/hantao/models/0916_ti_to_ti_sft
  model_max_length: 2048
  repetition_penalty: 1.0
  reward_critic_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800
  reward_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800
  temperature: 1.0
  top_p: 1.0
  trust_remote_code: true
special_tokens: null
train_cfgs:
  actor_gradient_checkpointing: true
  actor_lr: 1.0e-05
  actor_lr_scheduler_type: cosine
  actor_lr_warmup_ratio: 0.03
  actor_weight_decay: 0.01
  adam_betas:
  - 0.9
  - 0.95
  bf16: true
  clip_range_ratio: 0.2
  clip_range_score: 50.0
  clip_range_value: 5.0
  critic_gradient_checkpointing: true
  critic_lr: 5.0e-06
  critic_lr_scheduler_type: constant
  critic_lr_warmup_ratio: 0.03
  critic_weight_decay: 0.0
  ds_cfgs: ds_z3_config.json
  epochs: 3
  eval_interval: 10
  eval_strategy: epoch
  fp16: false
  freeze_language_model: true
  freeze_mm_proj: true
  freeze_vision_tower: false
  gae_lambda: 0.95
  gamma: 1.0
  gradient_accumulation_steps: 2
  kl_coeff: 0.02
  normalize_reward: false
  per_device_eval_batch_size: 8
  per_device_prompt_batch_size: 8
  per_device_train_batch_size: 8
  ptx_coeff: 16.0
  seed: 42
  update_iters: 1
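For orientation, a minimal sketch of the effective batch sizes these train_cfgs imply, assuming the 8-process launch recorded in environ.txt below (WORLD_SIZE=8 is not part of this YAML):

# Sketch only: effective batch sizes implied by train_cfgs above.
world_size = 8                   # assumption, taken from environ.txt (WORLD_SIZE=8)
per_device_prompt = 8            # per_device_prompt_batch_size
per_device_train = 8             # per_device_train_batch_size
grad_accum = 2                   # gradient_accumulation_steps

prompts_per_rollout = per_device_prompt * world_size                      # 64 prompts per PPO rollout
samples_per_optimizer_step = per_device_train * grad_accum * world_size   # 128 samples per update
print(prompts_per_rollout, samples_per_optimizer_step)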
config.json
ADDED
The diff for this file is too large to render.
environ.txt
ADDED
@@ -0,0 +1,62 @@
CONDA_DEFAULT_ENV=hantao_cham
CONDA_EXE=/home/align-anything/miniconda3/bin/conda
CONDA_PREFIX=/home/align-anything/miniconda3/envs/hantao_cham
CONDA_PREFIX_1=/home/align-anything/miniconda3
CONDA_PREFIX_2=/home/align-anything/miniconda3/envs/hantao_cham
CONDA_PREFIX_3=/home/align-anything/miniconda3/envs/hantao_stable
CONDA_PREFIX_4=/home/align-anything/miniconda3/envs/hantao_cham
CONDA_PREFIX_5=/home/align-anything/miniconda3/envs/hantao_stable
CONDA_PREFIX_6=/home/align-anything/miniconda3/envs/hantao_cham
CONDA_PREFIX_7=/home/align-anything/miniconda3/envs/hantao_stable
CONDA_PROMPT_MODIFIER=(hantao_cham)
CONDA_PYTHON_EXE=/home/align-anything/miniconda3/bin/python
CONDA_SHLVL=8
CRASHDIR=/etc/ShellCrash
CROSS_RANK=0
CROSS_SIZE=1
CUDA_MODULE_LOADING=LAZY
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/2000/bus
HOME=/home/align-anything
LANG=en_US.UTF-8
LD_LIBRARY_PATH=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/../../lib64:
LESSCLOSE=/usr/bin/lesspipe %s %s
LESSOPEN=| /usr/bin/lesspipe %s
LOCAL_RANK=0
LOCAL_SIZE=8
LOGLEVEL=WARNING
LOGNAME=align-anything
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
MASTER_ADDR=127.0.0.1
MASTER_PORT=23139
MOTD_SHOWN=pam
OLDPWD=/data/align-anything/hantao/align-anything/projects/text_image_to_text_image
PATH=/home/align-anything/miniconda3/envs/hantao_cham/bin:/home/align-anything/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
PWD=/data/align-anything/hantao/align-anything/scripts
PYGAME_HIDE_SUPPORT_PROMPT=1
PYTHONHASHSEED=42
PYTHONPATH=/data/align-anything/hantao/align-anything
QT_QPA_FONTDIR=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/qt/fonts
QT_QPA_PLATFORM_PLUGIN_PATH=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/qt/plugins
RANK=0
SHELL=/bin/bash
SHLVL=3
SSH_CLIENT=111.205.230.212 43947 30600
SSH_CONNECTION=111.205.230.212 44215 10.10.212.196 30600
SSH_TTY=/dev/pts/1
TERM=screen
TMUX=/tmp/tmux-2000/default,53635,1
TMUX_PANE=%1
USER=align-anything
WANDB_API_KEY=7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33
WANDB_MODE=online
WANDB_SERVICE=2-1071579-tcp-localhost-45673
WORLD_SIZE=8
XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
XDG_RUNTIME_DIR=/run/user/2000
XDG_SESSION_CLASS=user
XDG_SESSION_ID=15
XDG_SESSION_TYPE=tty
_=/home/align-anything/miniconda3/envs/hantao_cham/bin/deepspeed
_CE_CONDA=
_CE_M=
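A small sketch of the launch topology these variables describe, assuming the usual DeepSpeed/torch.distributed meaning of WORLD_SIZE and LOCAL_SIZE:

# Sketch only: topology implied by the environment above.
world_size, local_size = 8, 8                  # WORLD_SIZE, LOCAL_SIZE
visible_gpus = "0,1,2,3,4,5,6,7".split(",")    # CUDA_VISIBLE_DEVICES
num_nodes = world_size // local_size           # 1 -> a single 8-GPU node
print(num_nodes, len(visible_gpus))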
preprocessor_config.json
ADDED
@@ -0,0 +1,28 @@
{
  "crop_size": {
    "height": 512,
    "width": 512
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    1.0,
    1.0,
    1.0
  ],
  "image_processor_type": "ChameleonImageProcessor",
  "image_std": [
    1.0,
    1.0,
    1.0
  ],
  "processor_class": "ChameleonProcessor",
  "resample": 1,
  "rescale_factor": 0.0078,
  "size": {
    "shortest_edge": 512
  }
}
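As a reading aid, a sketch of the per-pixel arithmetic these fields encode, assuming the usual Hugging Face image-processor order (resize/center-crop, rescale, then normalize):

import numpy as np

# Sketch only: rescale by 0.0078, then normalize with mean=1.0 and std=1.0,
# which maps 8-bit pixel values into roughly [-1, 1].
rescale_factor, mean, std = 0.0078, 1.0, 1.0
pixels = np.array([0.0, 128.0, 255.0])
out = (pixels * rescale_factor - mean) / std   # [-1.0, -0.0016, 0.989]
print(out)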
processor_config.json
ADDED
@@ -0,0 +1,5 @@
{
  "image_seq_length": 1024,
  "image_token": "<image>",
  "processor_class": "ChameleonProcessor"
}
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e463e8c312e3fcebc9ff47999f2efbe72a12805c69e3ba885a8b38b4ebe8d478
size 14165009930
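This is a Git LFS pointer rather than the weights themselves (about 14.2 GB). A sketch of checking a downloaded copy against it; the local filename is an assumption:

import hashlib, os

# Sketch only: verify a downloaded pytorch_model.bin against the LFS pointer above.
expected_sha256 = "e463e8c312e3fcebc9ff47999f2efbe72a12805c69e3ba885a8b38b4ebe8d478"
expected_size = 14165009930
path = "pytorch_model.bin"  # assumed local path

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)

assert os.path.getsize(path) == expected_size
assert h.hexdigest() == expected_sha256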
script.sh
ADDED
@@ -0,0 +1,48 @@
#!/usr/bin/env bash
#
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Initialize variables
# For wandb online logging
export WANDB_API_KEY="7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33"
# Source the setup script
# source ./setup.sh

export WANDB_MODE=online

ACTOR_MODEL_NAME_OR_PATH="/data/align-anything/hantao/models/0916_ti_to_ti_sft"
CRITIC_MODEL_NAME_OR_PATH="/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800"
REWARD_MODEL_NAME_OR_PATH="/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800"
TRAIN_DATASETS="/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs"
PTX_DATASETS="/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs"
OUTPUT_DIR="../outputs/ppo_ti2ti_llf_1023_step_800"

# Source the setup script
source ./setup.sh

# Execute deepspeed command
deepspeed \
    --master_port ${MASTER_PORT} \
    --module align_anything.trainers.text_image_to_text_image.ppo \
    --actor_model_name_or_path ${ACTOR_MODEL_NAME_OR_PATH} \
    --reward_model_name_or_path ${REWARD_MODEL_NAME_OR_PATH} \
    --reward_critic_model_name_or_path ${CRITIC_MODEL_NAME_OR_PATH} \
    --train_datasets ${TRAIN_DATASETS} \
    --train_template spavl_ti2ti \
    --train_data_files ti2ti_llf_prompt_only_tokenize.pt \
    --output_dir ${OUTPUT_DIR} \
    --save_interval 30
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "<reserved08706>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
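A sketch of loading the bundled tokenizer and inspecting these entries; the local path is a placeholder for wherever this repository is checked out:

from transformers import AutoTokenizer

# Sketch only: "." assumes the script runs from a local copy of this folder.
tokenizer = AutoTokenizer.from_pretrained(".")
print(tokenizer.special_tokens_map)  # expect <s>, </s>, <pad>, <reserved08706>, <unk>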
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
The diff for this file is too large to render.
wandb/debug-internal.log
ADDED
@@ -0,0 +1,20 @@
{"time":"2024-10-23T09:05:57.421439083Z","level":"INFO","msg":"using version","core version":"0.18.3"}
{"time":"2024-10-23T09:05:57.421466903Z","level":"INFO","msg":"created symlink","path":"../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug-core.log"}
{"time":"2024-10-23T09:05:57.425862224Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
{"time":"2024-10-23T09:05:57.446650446Z","level":"INFO","msg":"created new stream","id":"paei5sn7"}
{"time":"2024-10-23T09:05:57.44668639Z","level":"INFO","msg":"stream: started","id":"paei5sn7"}
{"time":"2024-10-23T09:05:57.446720716Z","level":"INFO","msg":"handler: started","stream_id":{"value":"paei5sn7"}}
{"time":"2024-10-23T09:05:57.446711826Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"paei5sn7"}}
{"time":"2024-10-23T09:05:57.446734515Z","level":"INFO","msg":"sender: started","stream_id":{"value":"paei5sn7"}}
{"time":"2024-10-23T09:05:58.065915529Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
{"time":"2024-10-23T09:05:58.068726401Z","level":"INFO","msg":"Starting system monitor"}
{"time":"2024-10-23T11:05:25.66533688Z","level":"INFO","msg":"Stopping system monitor"}
{"time":"2024-10-23T11:05:25.691953146Z","level":"INFO","msg":"Stopped system monitor"}
{"time":"2024-10-23T11:05:26.186981401Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
{"time":"2024-10-23T11:05:26.187015556Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
{"time":"2024-10-23T11:05:27.289810318Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
{"time":"2024-10-23T11:05:28.84134155Z","level":"INFO","msg":"stream: closing","id":"paei5sn7"}
{"time":"2024-10-23T11:05:28.841377348Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"paei5sn7"}}
{"time":"2024-10-23T11:05:28.841437021Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"paei5sn7"}}
{"time":"2024-10-23T11:05:28.841525942Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"paei5sn7"}}
{"time":"2024-10-23T11:05:28.842923992Z","level":"INFO","msg":"stream: closed","id":"paei5sn7"}
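These are JSON-lines records. A sketch of recovering the wall-clock span of the run from the first and last entries (about two hours here), dropping the sub-second fraction to keep parsing simple:

import json
from datetime import datetime

# Sketch only: run duration from wandb's internal log.
with open("wandb/debug-internal.log") as f:
    records = [json.loads(line) for line in f if line.strip()]

def parse(ts):
    # Timestamps carry nanoseconds; strip everything after the seconds field.
    return datetime.strptime(ts.split(".")[0], "%Y-%m-%dT%H:%M:%S")

print(parse(records[-1]["time"]) - parse(records[0]["time"]))  # ~1:59:31 for this run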
wandb/debug.log
ADDED
@@ -0,0 +1,33 @@
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Configure stats pid to 1071579
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
2024-10-23 09:05:57,409 WARNING MainThread:1071579 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.ppo
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.ppo'}
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Applying login settings: {}
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:_log_setup():532] Logging user logs to ../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug.log
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:_log_setup():533] Logging internal logs to ../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug-internal.log
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:init():617] calling init triggers
2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_prompt_batch_size': 8, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 2, 'actor_gradient_checkpointing': True, 'critic_gradient_checkpointing': True, 'actor_lr': 1e-05, 'actor_lr_scheduler_type': 'cosine', 'actor_lr_warmup_ratio': 0.03, 'actor_weight_decay': 0.01, 'critic_lr': 5e-06, 'critic_lr_scheduler_type': 'constant', 'critic_lr_warmup_ratio': 0.03, 'critic_weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'kl_coeff': 0.02, 'clip_range_ratio': 0.2, 'clip_range_score': 50.0, 'clip_range_value': 5.0, 'ptx_coeff': 16.0, 'gamma': 1.0, 'gae_lambda': 0.95, 'normalize_reward': False, 'update_iters': 1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs', 'train_template': 'spavl_ti2ti', 'train_size': 5000, 'train_split': None, 'train_subset': None, 'train_data_files': 'ti2ti_llf_prompt_only_tokenize.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': [], 'ptx_datasets': None, 'ptx_template': None, 'ptx_size': None, 'ptx_subset': None, 'ptx_split': None, 'ptx_data_files': None, 'ptx_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'ppo', 'output_dir': '../outputs/ppo_ti2ti_llf_1023_step_800', 'cache_dir': None, 'save_interval': 30.0}, 'model_cfgs': {'actor_model_name_or_path': '/data/align-anything/hantao/models/0916_ti_to_ti_sft', 'reward_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800', 'reward_critic_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800', 'trust_remote_code': True, 'model_max_length': 2048, 'temperature': 1.0, 'top_p': 1.0, 'repetition_penalty': 1.0}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': None}
2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():667] starting backend
2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():671] sending inform_init request
2024-10-23 09:05:57,414 INFO MainThread:1071579 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2024-10-23 09:05:57,415 INFO MainThread:1071579 [wandb_init.py:init():684] backend started and connected
2024-10-23 09:05:57,418 INFO MainThread:1071579 [wandb_init.py:init():779] updated telemetry
2024-10-23 09:05:57,429 INFO MainThread:1071579 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
2024-10-23 09:05:58,062 INFO MainThread:1071579 [wandb_init.py:init():863] starting run threads in backend
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_console_start():2465] atexit reg
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2313] redirect: wrap_raw
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2378] Wrapping output streams.
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2403] Redirects installed.
2024-10-23 09:05:58,197 INFO MainThread:1071579 [wandb_init.py:init():907] run started, returning control to user process
2024-10-23 11:05:25,658 INFO MainThread:1071579 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/paei5sn7
2024-10-23 11:05:25,662 INFO MainThread:1071579 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
2024-10-23 11:05:25,664 INFO MainThread:1071579 [wandb_run.py:_restore():2410] restore
2024-10-23 11:05:25,664 INFO MainThread:1071579 [wandb_run.py:_restore():2416] restore done
2024-10-23 11:05:28,807 INFO MainThread:1071579 [wandb_run.py:_footer_history_summary_info():4049] rendering history
2024-10-23 11:05:28,809 INFO MainThread:1071579 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
2024-10-23 11:05:28,839 INFO MainThread:1071579 [wandb_run.py:_footer_sync_info():4008] logging synced files
wandb/run-20241023_090557-paei5sn7/files/config.yaml
ADDED
@@ -0,0 +1,143 @@
_wandb:
  value:
    cli_version: 0.18.3
    m: []
    python_version: 3.11.10
    t:
      "1":
      - 1
      - 11
      - 41
      - 49
      - 51
      - 55
      - 71
      - 83
      - 98
      - 105
      "2":
      - 1
      - 11
      - 41
      - 49
      - 51
      - 55
      - 71
      - 83
      - 98
      - 105
      "3":
      - 2
      - 13
      - 16
      - 23
      - 55
      - 61
      "4": 3.11.10
      "5": 0.18.3
      "6": 4.44.0.dev0
      "8":
      - 5
      "12": 0.18.3
      "13": linux-x86_64
bnb_cfgs:
  value:
    bnb_4bit_compute_dtype: float16
    bnb_4bit_quant_type: nf4
    bnb_4bit_use_double_quant: true
    load_in_4bit: true
    load_in_8bit: false
    use_bnb: false
data_cfgs:
  value:
    eval_data_files: null
    eval_datasets: null
    eval_optional_args: []
    eval_size: null
    eval_split: null
    eval_subset: null
    eval_template: null
    ptx_data_files: null
    ptx_datasets: null
    ptx_optional_args: []
    ptx_size: null
    ptx_split: null
    ptx_subset: null
    ptx_template: null
    train_data_files: ti2ti_llf_prompt_only_tokenize.pt
    train_datasets: /data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs
    train_optional_args: []
    train_size: 5000
    train_split: null
    train_subset: null
    train_template: spavl_ti2ti
logger_cfgs:
  value:
    cache_dir: null
    log_project: align-anything
    log_run_name: ppo
    log_type: wandb
    output_dir: ../outputs/ppo_ti2ti_llf_1023_step_800
    save_interval: 30
lora_cfgs:
  value:
    inference_mode: false
    lora_alpha: 16
    lora_dropout: 0.1
    r: 16
    save_full_model: true
    target_modules:
    - q_proj
    - v_proj
    task_type: TaskType.CAUSAL_LM
    use_lora: false
model_cfgs:
  value:
    actor_model_name_or_path: /data/align-anything/hantao/models/0916_ti_to_ti_sft
    model_max_length: 2048
    repetition_penalty: 1
    reward_critic_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800
    reward_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800
    temperature: 1
    top_p: 1
    trust_remote_code: true
special_tokens:
  value: null
train_cfgs:
  value:
    actor_gradient_checkpointing: true
    actor_lr: 1e-05
    actor_lr_scheduler_type: cosine
    actor_lr_warmup_ratio: 0.03
    actor_weight_decay: 0.01
    adam_betas:
    - 0.9
    - 0.95
    bf16: true
    clip_range_ratio: 0.2
    clip_range_score: 50
    clip_range_value: 5
    critic_gradient_checkpointing: true
    critic_lr: 5e-06
    critic_lr_scheduler_type: constant
    critic_lr_warmup_ratio: 0.03
    critic_weight_decay: 0
    ds_cfgs: ds_z3_config.json
    epochs: 3
    eval_interval: 10
    eval_strategy: epoch
    fp16: false
    freeze_language_model: true
    freeze_mm_proj: true
    freeze_vision_tower: false
    gae_lambda: 0.95
    gamma: 1
    gradient_accumulation_steps: 2
    kl_coeff: 0.02
    normalize_reward: false
    per_device_eval_batch_size: 8
    per_device_prompt_batch_size: 8
    per_device_train_batch_size: 8
    ptx_coeff: 16
    seed: 42
    update_iters: 1
wandb/run-20241023_090557-paei5sn7/files/output.log
ADDED
@@ -0,0 +1,261 @@
***** Running training *****
Training 1/3 epoch: 0%| | 0/237 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Training 1/3 epoch (reward -0.3069): 13%|██████████████████████████████▌ | 30/237 [1:00:27<12:52:22, 223.88s/it]
[2024-10-23 09:13:46,654] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:13:50,992] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:18:33,696] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:18:38,270] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:23:48,615] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:23:53,939] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:27:04,368] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:27:08,626] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:30:00,787] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:30:04,867] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:32:26,802] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:32:30,881] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:36:44,344] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:36:47,973] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:39:01,718] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:39:01,719] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[9.908858470377793e-06, 9.908858470377793e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-23 09:39:01,719] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=14.571748606590951, CurrSamplesPerSec=15.598687308172462, MemAllocated=33.2GB, MaxMemAllocated=48.11GB
[2024-10-23 09:39:05,687] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:39:05,688] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-23 09:39:05,689] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=15.239809655137655, CurrSamplesPerSec=16.432232979486614, MemAllocated=33.2GB, MaxMemAllocated=48.11GB
[2024-10-23 09:41:40,302] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:41:43,921] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:58:00,759] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:58:06,692] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:06:16,814] [WARNING] [stage3.py:2104:step] 3 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:06:22,976] [WARNING] [stage3.py:2104:step] 4 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
Saving checkpoint at step 30 ...
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
Saving 16-bit model...
[2024-10-23 10:06:35,014] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step15 is about to be saved!
[2024-10-23 10:06:35,016] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin, tag: global_step15
[2024-10-23 10:06:35,016] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin...
[2024-10-23 10:06:58,291] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin.
[2024-10-23 10:06:58,293] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step15 is ready now!
Model saved!
Saving 16-bit model...
[2024-10-23 10:07:09,145] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step15 is about to be saved!
[2024-10-23 10:07:09,146] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin, tag: global_step15
[2024-10-23 10:07:09,146] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin...
[2024-10-23 10:07:32,380] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin.
[2024-10-23 10:07:32,381] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step15 is ready now!
Model saved!
Model saved!
Checkpoint saved.
[2024-10-23 10:15:51,652] [WARNING] [stage3.py:2104:step] 3 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:15:57,685] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:16:30,368] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:16:33,757] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:17:06,123] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:17:09,627] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:17:35,330] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:17:38,641] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:18:10,448] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:18:10,449] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[9.470431355738257e-06, 9.470431355738257e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-23 10:18:10,450] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=14.073356193448515, CurrSamplesPerSec=18.590383641536306, MemAllocated=33.2GB, MaxMemAllocated=52.44GB
[2024-10-23 10:18:13,794] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:18:13,795] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-23 10:18:13,795] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=14.764142588014744, CurrSamplesPerSec=19.524800757549972, MemAllocated=33.2GB, MaxMemAllocated=52.44GB
[2024-10-23 10:18:45,865] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:18:49,172] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:19:24,641] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:19:27,927] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:19:54,182] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:19:57,506] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:20:51,612] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:20:54,879] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:21:17,299] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:21:20,616] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:21:42,642] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:21:45,941] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:22:12,002] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:22:15,333] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:22:36,950] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:22:40,236] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
78 |
+
[2024-10-23 10:23:01,348] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
79 |
+
[2024-10-23 10:23:01,349] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[8.70045279830626e-06, 8.70045279830626e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
80 |
+
[2024-10-23 10:23:01,350] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=15.461989587711498, CurrSamplesPerSec=19.013700639766963, MemAllocated=33.15GB, MaxMemAllocated=52.44GB
|
81 |
+
[2024-10-23 10:23:04,658] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
82 |
+
[2024-10-23 10:23:04,659] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
83 |
+
[2024-10-23 10:23:04,659] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=16.198688238179816, CurrSamplesPerSec=19.81263984833098, MemAllocated=33.15GB, MaxMemAllocated=52.44GB
|
84 |
+
Saving checkpoint at step 60 ...
|
85 |
+
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
|
86 |
+
Saving 16-bit model...
|
87 |
+
[2024-10-23 10:23:19,118] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step30 is about to be saved!
|
88 |
+
[2024-10-23 10:23:19,120] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin, tag: global_step30
|
89 |
+
[2024-10-23 10:23:19,120] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin...
|
90 |
+
[2024-10-23 10:23:37,909] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin.
|
91 |
+
[2024-10-23 10:23:37,912] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step30 is ready now!
|
92 |
+
Model saved!
|
93 |
+
Saving 16-bit model...
|
94 |
+
[2024-10-23 10:23:49,578] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step30 is about to be saved!
|
95 |
+
[2024-10-23 10:23:49,578] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin, tag: global_step30
|
96 |
+
[2024-10-23 10:23:49,579] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin...
|
97 |
+
[2024-10-23 10:24:12,803] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin.
|
98 |
+
[2024-10-23 10:24:12,804] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step30 is ready now!
|
99 |
+
Model saved!
|
100 |
+
Model saved!
|
101 |
+
Checkpoint saved.
|
102 |
+
[2024-10-23 10:25:22,814] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
103 |
+
[2024-10-23 10:25:26,107] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
104 |
+
[2024-10-23 10:26:58,273] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
105 |
+
[2024-10-23 10:27:01,584] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
106 |
+
[2024-10-23 10:28:01,902] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[7.656028585269017e-06, 7.656028585269017e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
107 |
+
[2024-10-23 10:28:01,903] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=16.36638951494034, CurrSamplesPerSec=29.39789939491861, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
108 |
+
[2024-10-23 10:28:05,224] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
109 |
+
[2024-10-23 10:28:05,225] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=17.12711182391296, CurrSamplesPerSec=30.71016374643415, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
110 |
+
[2024-10-23 10:28:25,905] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
111 |
+
[2024-10-23 10:28:29,159] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
112 |
+
[2024-10-23 10:29:13,184] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
113 |
+
[2024-10-23 10:29:16,439] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
114 |
+
[2024-10-23 10:29:37,209] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
115 |
+
[2024-10-23 10:29:40,499] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
116 |
+
Saving checkpoint at step 90 ...
|
117 |
+
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
|
118 |
+
Saving 16-bit model...
|
119 |
+
[2024-10-23 10:30:18,686] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step45 is about to be saved!
|
120 |
+
[2024-10-23 10:30:18,688] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin, tag: global_step45
|
121 |
+
[2024-10-23 10:30:18,688] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin...
|
122 |
+
[2024-10-23 10:30:38,714] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin.
|
123 |
+
[2024-10-23 10:30:38,715] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step45 is ready now!
|
124 |
+
Model saved!
|
125 |
+
Saving 16-bit model...
|
126 |
+
[2024-10-23 10:30:48,994] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step45 is about to be saved!
|
127 |
+
[2024-10-23 10:30:48,995] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin, tag: global_step45
|
128 |
+
[2024-10-23 10:30:48,995] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin...
|
129 |
+
[2024-10-23 10:31:12,474] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin.
|
130 |
+
[2024-10-23 10:31:12,476] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step45 is ready now!
|
131 |
+
Model saved!
|
132 |
+
Model saved!
|
133 |
+
Checkpoint saved.
|
134 |
+
[2024-10-23 10:33:08,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[6.41461888258465e-06, 6.41461888258465e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
135 |
+
[2024-10-23 10:33:08,192] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=16.840242031082227, CurrSamplesPerSec=18.082056043756573, MemAllocated=33.1GB, MaxMemAllocated=52.44GB
|
136 |
+
[2024-10-23 10:33:11,652] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
137 |
+
[2024-10-23 10:33:11,653] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=17.604303267186207, CurrSamplesPerSec=19.299485552228777, MemAllocated=33.1GB, MaxMemAllocated=52.44GB
|
138 |
+
[2024-10-23 10:37:08,335] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[5.068293368829755e-06, 5.068293368829755e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
139 |
+
[2024-10-23 10:37:08,336] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=17.145914370621718, CurrSamplesPerSec=19.351638771074402, MemAllocated=33.14GB, MaxMemAllocated=52.44GB
|
140 |
+
[2024-10-23 10:37:11,634] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
141 |
+
[2024-10-23 10:37:11,635] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=17.89246566680119, CurrSamplesPerSec=19.82526412245786, MemAllocated=33.14GB, MaxMemAllocated=52.44GB
|
142 |
+
Saving checkpoint at step 120 ...
|
143 |
+
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
|
144 |
+
Saving 16-bit model...
|
145 |
+
[2024-10-23 10:37:26,476] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step60 is about to be saved!
|
146 |
+
[2024-10-23 10:37:26,477] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin, tag: global_step60
|
147 |
+
[2024-10-23 10:37:26,478] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin...
|
148 |
+
[2024-10-23 10:37:44,898] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin.
|
149 |
+
[2024-10-23 10:37:44,900] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step60 is ready now!
|
150 |
+
Model saved!
|
151 |
+
Saving 16-bit model...
|
152 |
+
[2024-10-23 10:37:55,791] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step60 is about to be saved!
|
153 |
+
[2024-10-23 10:37:55,792] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin, tag: global_step60
|
154 |
+
[2024-10-23 10:37:55,792] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin...
|
155 |
+
[2024-10-23 10:38:18,798] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin.
|
156 |
+
[2024-10-23 10:38:18,799] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step60 is ready now!
|
157 |
+
Model saved!
|
158 |
+
Model saved!
|
159 |
+
Checkpoint saved.
|
160 |
+
[2024-10-23 10:42:13,477] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[3.7169028483301333e-06, 3.7169028483301333e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
161 |
+
[2024-10-23 10:42:13,478] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=17.37727340710681, CurrSamplesPerSec=18.834808666541562, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
162 |
+
[2024-10-23 10:42:16,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
163 |
+
[2024-10-23 10:42:16,796] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=18.13780306709264, CurrSamplesPerSec=19.724826811206178, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
164 |
+
Saving checkpoint at step 150 ...
|
165 |
+
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
|
166 |
+
Saving 16-bit model...
|
167 |
+
[2024-10-23 10:44:30,480] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step75 is about to be saved!
|
168 |
+
[2024-10-23 10:44:30,481] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin, tag: global_step75
|
169 |
+
[2024-10-23 10:44:30,482] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin...
|
170 |
+
[2024-10-23 10:44:50,645] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin.
|
171 |
+
[2024-10-23 10:44:50,646] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step75 is ready now!
|
172 |
+
Model saved!
|
173 |
+
Saving 16-bit model...
|
174 |
+
[2024-10-23 10:45:00,723] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step75 is about to be saved!
|
175 |
+
[2024-10-23 10:45:00,724] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin, tag: global_step75
|
176 |
+
[2024-10-23 10:45:00,725] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin...
|
177 |
+
[2024-10-23 10:45:22,429] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin.
|
178 |
+
[2024-10-23 10:45:22,430] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step75 is ready now!
|
179 |
+
Model saved!
|
180 |
+
Model saved!
|
181 |
+
Checkpoint saved.
|
182 |
+
[2024-10-23 10:47:09,650] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[2.4606737737909696e-06, 2.4606737737909696e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
183 |
+
[2024-10-23 10:47:09,651] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=17.61415636579215, CurrSamplesPerSec=18.93111536799049, MemAllocated=33.29GB, MaxMemAllocated=52.44GB
|
184 |
+
[2024-10-23 10:47:13,054] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
185 |
+
[2024-10-23 10:47:13,055] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=18.395644821730034, CurrSamplesPerSec=19.45837151531338, MemAllocated=33.29GB, MaxMemAllocated=52.44GB
|
186 |
+
[2024-10-23 10:48:21,406] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
187 |
+
[2024-10-23 10:48:24,761] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
188 |
+
[2024-10-23 10:48:45,180] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
189 |
+
[2024-10-23 10:48:48,458] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
190 |
+
[2024-10-23 10:50:43,651] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
191 |
+
[2024-10-23 10:50:46,969] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
192 |
+
[2024-10-23 10:51:07,962] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[1.3927749088052218e-06, 1.3927749088052218e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
193 |
+
[2024-10-23 10:51:07,962] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=17.746439174563342, CurrSamplesPerSec=17.855702558049163, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
194 |
+
[2024-10-23 10:51:11,460] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
195 |
+
[2024-10-23 10:51:11,461] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=18.529656252919512, CurrSamplesPerSec=18.696832537132607, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
196 |
+
Saving checkpoint at step 180 ...
|
197 |
+
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
|
198 |
+
Saving 16-bit model...
|
199 |
+
[2024-10-23 10:51:23,210] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step90 is about to be saved!
|
200 |
+
[2024-10-23 10:51:23,211] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin, tag: global_step90
|
201 |
+
[2024-10-23 10:51:23,211] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin...
|
202 |
+
[2024-10-23 10:51:42,948] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin.
|
203 |
+
[2024-10-23 10:51:42,950] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step90 is ready now!
|
204 |
+
Model saved!
|
205 |
+
Saving 16-bit model...
|
206 |
+
[2024-10-23 10:51:51,561] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step90 is about to be saved!
|
207 |
+
[2024-10-23 10:51:51,562] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin, tag: global_step90
|
208 |
+
[2024-10-23 10:51:51,563] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin...
|
209 |
+
[2024-10-23 10:52:13,036] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin.
|
210 |
+
[2024-10-23 10:52:13,038] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step90 is ready now!
|
211 |
+
Model saved!
|
212 |
+
Model saved!
|
213 |
+
Checkpoint saved.
|
214 |
+
[2024-10-23 10:53:45,229] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
215 |
+
[2024-10-23 10:53:48,631] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
216 |
+
[2024-10-23 10:56:08,616] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[5.924074268766422e-07, 5.924074268766422e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
217 |
+
[2024-10-23 10:56:08,617] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=17.839255597575846, CurrSamplesPerSec=18.94653826925029, MemAllocated=33.12GB, MaxMemAllocated=52.44GB
|
218 |
+
[2024-10-23 10:56:11,882] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
219 |
+
[2024-10-23 10:56:11,883] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=18.63801951889686, CurrSamplesPerSec=19.80692964117057, MemAllocated=33.12GB, MaxMemAllocated=52.44GB
|
220 |
+
Saving checkpoint at step 210 ...
|
221 |
+
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
|
222 |
+
Saving 16-bit model...
|
223 |
+
[2024-10-23 10:58:21,816] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step105 is about to be saved!
|
224 |
+
[2024-10-23 10:58:21,818] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin, tag: global_step105
|
225 |
+
[2024-10-23 10:58:21,818] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin...
|
226 |
+
[2024-10-23 10:58:38,899] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin.
|
227 |
+
[2024-10-23 10:58:38,901] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step105 is ready now!
|
228 |
+
Model saved!
|
229 |
+
Saving 16-bit model...
|
230 |
+
[2024-10-23 10:58:47,269] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step105 is about to be saved!
|
231 |
+
[2024-10-23 10:58:47,270] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin, tag: global_step105
|
232 |
+
[2024-10-23 10:58:47,270] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin...
|
233 |
+
[2024-10-23 10:59:09,672] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin.
|
234 |
+
[2024-10-23 10:59:09,674] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step105 is ready now!
|
235 |
+
Model saved!
|
236 |
+
Model saved!
|
237 |
+
Checkpoint saved.
|
238 |
+
[2024-10-23 11:00:41,653] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
239 |
+
[2024-10-23 11:00:44,969] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
240 |
+
[2024-10-23 11:01:05,305] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[1.1893092270227724e-07, 1.1893092270227724e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
241 |
+
[2024-10-23 11:01:05,306] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=17.926464030034182, CurrSamplesPerSec=18.82486069382099, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
242 |
+
[2024-10-23 11:01:08,570] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
243 |
+
[2024-10-23 11:01:08,571] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=18.731570052517945, CurrSamplesPerSec=19.83092777116667, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
244 |
+
[2024-10-23 11:01:53,444] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
245 |
+
[2024-10-23 11:01:56,743] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
246 |
+
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
|
247 |
+
Saving 16-bit model...
|
248 |
+
[2024-10-23 11:04:33,468] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step118 is about to be saved!
|
249 |
+
[2024-10-23 11:04:33,469] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin, tag: global_step118
|
250 |
+
[2024-10-23 11:04:33,469] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin...
|
251 |
+
[2024-10-23 11:04:53,660] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin.
|
252 |
+
[2024-10-23 11:04:53,662] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step118 is ready now!
|
253 |
+
Model saved!
|
254 |
+
Saving 16-bit model...
|
255 |
+
[2024-10-23 11:05:03,419] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step118 is about to be saved!
|
256 |
+
[2024-10-23 11:05:03,420] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin, tag: global_step118
|
257 |
+
[2024-10-23 11:05:03,420] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin...
|
258 |
+
[2024-10-23 11:05:25,565] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin.
|
259 |
+
[2024-10-23 11:05:25,566] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step118 is ready now!
|
260 |
+
Model saved!
|
261 |
+
Model saved!
|
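Note: the stage3.py warnings repeated throughout the output.log above recommend adding get_accelerator().empty_cache() calls to the training loop when allocator cache flushes occur under memory pressure. A minimal sketch of that remedy is given below; it is not part of the logged run, and the step function and variable names are hypothetical placeholders around the real DeepSpeed API the warning refers to.

    from deepspeed.accelerator import get_accelerator

    def train_step(engine, batch):
        # Hypothetical PPO-style update step; only the empty_cache() call is
        # taken from the warning text above.
        loss = engine(batch)        # forward pass on the DeepSpeed engine
        engine.backward(loss)       # backward pass
        engine.step()               # optimizer step (where the warning is emitted)
        # Flush the PyTorch caching allocator at the same point on every rank,
        # so all ranks flush their caches at the same time.
        get_accelerator().empty_cache()
        return loss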
wandb/run-20241023_090557-paei5sn7/files/requirements.txt
ADDED
@@ -0,0 +1,230 @@
1 |
+
align-anything==0.0.1.dev0
|
2 |
+
nvidia-cusolver-cu12==11.4.5.107
|
3 |
+
nvidia-curand-cu12==10.3.2.106
|
4 |
+
wcwidth==0.2.13
|
5 |
+
nvidia-cuda-nvrtc-cu12==12.1.105
|
6 |
+
pycparser==2.22
|
7 |
+
tokenizers==0.19.1
|
8 |
+
grpcio==1.66.2
|
9 |
+
joblib==1.4.2
|
10 |
+
virtualenv==20.26.6
|
11 |
+
pyzmq==26.2.0
|
12 |
+
cffi==1.17.1
|
13 |
+
sentencepiece==0.2.0
|
14 |
+
nvidia-nvtx-cu12==12.1.105
|
15 |
+
docker-pycreds==0.4.0
|
16 |
+
outlines==0.1.1.dev4+ga2fd35c
|
17 |
+
nvidia-nvjitlink-cu12==12.6.77
|
18 |
+
annotated-types==0.7.0
|
19 |
+
certifi==2024.8.30
|
20 |
+
interegular==0.3.3
|
21 |
+
Jinja2==3.1.4
|
22 |
+
Brotli==1.1.0
|
23 |
+
fairscale==0.4.13
|
24 |
+
gradio_client==1.4.0
|
25 |
+
opencv-python==4.6.0.66
|
26 |
+
pyarrow==17.0.0
|
27 |
+
absl-py==2.1.0
|
28 |
+
lm-format-enforcer==0.10.6
|
29 |
+
pydantic_core==2.23.4
|
30 |
+
llvmlite==0.43.0
|
31 |
+
accelerate==1.0.1
|
32 |
+
pytest-split==0.8.0
|
33 |
+
nvidia-cuda-cupti-cu12==12.1.105
|
34 |
+
watchfiles==0.24.0
|
35 |
+
optree==0.13.0
|
36 |
+
py-cpuinfo==9.0.0
|
37 |
+
scikit-learn==1.5.2
|
38 |
+
ftfy==6.3.0
|
39 |
+
fastapi==0.115.0
|
40 |
+
psutil==6.0.0
|
41 |
+
MarkupSafe==2.1.5
|
42 |
+
nvidia-cublas-cu12==12.1.3.1
|
43 |
+
pip==24.2
|
44 |
+
websockets==12.0
|
45 |
+
tomlkit==0.12.0
|
46 |
+
torchaudio==2.4.0
|
47 |
+
huggingface-hub==0.25.2
|
48 |
+
mistral_common==1.4.4
|
49 |
+
image-reward==1.5
|
50 |
+
pyparsing==3.1.4
|
51 |
+
aiohappyeyeballs==2.4.3
|
52 |
+
click==8.1.7
|
53 |
+
httptools==0.6.1
|
54 |
+
decorator==4.4.2
|
55 |
+
tqdm==4.66.5
|
56 |
+
fonttools==4.54.1
|
57 |
+
kiwisolver==1.4.7
|
58 |
+
ruff==0.6.9
|
59 |
+
openai==1.51.2
|
60 |
+
partial-json-parser==0.2.1.1.post4
|
61 |
+
xformers==0.0.27.post2
|
62 |
+
distlib==0.3.9
|
63 |
+
GitPython==3.1.43
|
64 |
+
pytest==7.2.0
|
65 |
+
imageio==2.35.1
|
66 |
+
msgspec==0.18.6
|
67 |
+
proglog==0.1.10
|
68 |
+
yarl==1.15.0
|
69 |
+
markdown-it-py==3.0.0
|
70 |
+
PyYAML==6.0.2
|
71 |
+
xxhash==3.5.0
|
72 |
+
braceexpand==0.1.7
|
73 |
+
datasets==3.0.1
|
74 |
+
mpmath==1.3.0
|
75 |
+
distro==1.9.0
|
76 |
+
term-image==0.7.2
|
77 |
+
python-dotenv==1.0.1
|
78 |
+
semantic-version==2.10.0
|
79 |
+
multidict==6.1.0
|
80 |
+
vllm==0.6.2
|
81 |
+
sentry-sdk==2.16.0
|
82 |
+
idna==3.10
|
83 |
+
starlette==0.38.6
|
84 |
+
args==0.1.0
|
85 |
+
peft==0.13.2
|
86 |
+
librosa==0.10.2.post1
|
87 |
+
urllib3==2.2.3
|
88 |
+
python-dateutil==2.9.0.post0
|
89 |
+
pycountry==24.6.1
|
90 |
+
six==1.16.0
|
91 |
+
ffmpy==0.4.0
|
92 |
+
multiprocess==0.70.16
|
93 |
+
cycler==0.12.1
|
94 |
+
charset-normalizer==3.4.0
|
95 |
+
aiofiles==23.2.1
|
96 |
+
shellingham==1.5.4
|
97 |
+
propcache==0.2.0
|
98 |
+
lark==1.2.2
|
99 |
+
torch==2.4.0
|
100 |
+
Werkzeug==3.0.4
|
101 |
+
nvidia-cusparse-cu12==12.1.0.106
|
102 |
+
clip==0.2.0
|
103 |
+
hjson==3.1.0
|
104 |
+
diffusers==0.30.3
|
105 |
+
attrs==24.2.0
|
106 |
+
lazy_loader==0.4
|
107 |
+
numpy==1.26.4
|
108 |
+
rpds-py==0.20.0
|
109 |
+
pytz==2024.2
|
110 |
+
audioread==3.0.1
|
111 |
+
platformdirs==4.3.6
|
112 |
+
deepspeed==0.15.2
|
113 |
+
gguf==0.10.0
|
114 |
+
wandb==0.18.3
|
115 |
+
prometheus_client==0.21.0
|
116 |
+
gitdb==4.0.11
|
117 |
+
packaging==24.1
|
118 |
+
sympy==1.13.3
|
119 |
+
mutagen==1.47.0
|
120 |
+
contourpy==1.3.0
|
121 |
+
pluggy==1.5.0
|
122 |
+
python-multipart==0.0.12
|
123 |
+
soundfile==0.12.1
|
124 |
+
typer==0.12.5
|
125 |
+
timm==0.6.13
|
126 |
+
frozenlist==1.4.1
|
127 |
+
httpx==0.27.2
|
128 |
+
mmsg==0.1.dev20+g585c63a.d20241012
|
129 |
+
tiktoken==0.7.0
|
130 |
+
pydub==0.25.1
|
131 |
+
diskcache==5.6.3
|
132 |
+
einops==0.8.0
|
133 |
+
setproctitle==1.3.3
|
134 |
+
scipy==1.14.1
|
135 |
+
typing_extensions==4.12.2
|
136 |
+
httpcore==1.0.6
|
137 |
+
cfgv==3.4.0
|
138 |
+
requests==2.32.3
|
139 |
+
torchlibrosa==0.1.0
|
140 |
+
pydantic==2.9.2
|
141 |
+
torchvision==0.19.0
|
142 |
+
sniffio==1.3.1
|
143 |
+
pyairports==2.1.1
|
144 |
+
hpsv2==1.2.0
|
145 |
+
protobuf==3.20.3
|
146 |
+
wheel==0.44.0
|
147 |
+
smmap==5.0.1
|
148 |
+
zipp==3.20.2
|
149 |
+
iniconfig==2.0.0
|
150 |
+
airportsdata==20241001
|
151 |
+
clint==0.5.1
|
152 |
+
pooch==1.8.2
|
153 |
+
shortuuid==1.0.13
|
154 |
+
pycryptodomex==3.21.0
|
155 |
+
cloudpickle==3.1.0
|
156 |
+
transformers==4.44.0.dev0
|
157 |
+
regex==2024.9.11
|
158 |
+
numba==0.60.0
|
159 |
+
tzdata==2024.2
|
160 |
+
orjson==3.10.7
|
161 |
+
jsonschema-specifications==2024.10.1
|
162 |
+
safetensors==0.4.5
|
163 |
+
outlines_core==0.1.0
|
164 |
+
filelock==3.16.1
|
165 |
+
threadpoolctl==3.5.0
|
166 |
+
soxr==0.5.0.post1
|
167 |
+
nvidia-cufft-cu12==11.0.2.54
|
168 |
+
networkx==3.4.1
|
169 |
+
msgpack==1.1.0
|
170 |
+
pandas==2.2.3
|
171 |
+
align-anything==0.0.1.dev0
|
172 |
+
anyio==4.6.0
|
173 |
+
nvidia-cuda-runtime-cu12==12.1.105
|
174 |
+
bitsandbytes==0.44.1
|
175 |
+
aiohttp==3.10.10
|
176 |
+
matplotlib==3.9.2
|
177 |
+
triton==3.0.0
|
178 |
+
tensorboard==2.18.0
|
179 |
+
nodeenv==1.9.1
|
180 |
+
fsspec==2024.6.1
|
181 |
+
webdataset==0.2.100
|
182 |
+
imageio-ffmpeg==0.5.1
|
183 |
+
mdurl==0.1.2
|
184 |
+
identify==2.6.1
|
185 |
+
h11==0.14.0
|
186 |
+
uvloop==0.20.0
|
187 |
+
rich==13.9.2
|
188 |
+
frechet-audio-distance==0.1.2
|
189 |
+
uvicorn==0.31.1
|
190 |
+
pytorch-fid==0.3.0
|
191 |
+
yt-dlp==2024.8.6
|
192 |
+
jiter==0.6.1
|
193 |
+
nest-asyncio==1.6.0
|
194 |
+
pre_commit==4.0.1
|
195 |
+
referencing==0.35.1
|
196 |
+
resampy==0.4.3
|
197 |
+
tensorboard-data-server==0.7.2
|
198 |
+
importlib_metadata==8.5.0
|
199 |
+
aiosignal==1.3.1
|
200 |
+
dill==0.3.8
|
201 |
+
prometheus-fastapi-instrumentator==7.0.0
|
202 |
+
ninja==1.11.1.1
|
203 |
+
nvidia-ml-py==12.560.30
|
204 |
+
moviepy==1.0.3
|
205 |
+
nvidia-cudnn-cu12==9.1.0.70
|
206 |
+
Markdown==3.7
|
207 |
+
ray==2.37.0
|
208 |
+
gradio==5.0.2
|
209 |
+
jsonschema==4.23.0
|
210 |
+
Pygments==2.18.0
|
211 |
+
nvidia-nccl-cu12==2.20.5
|
212 |
+
pillow==10.4.0
|
213 |
+
setuptools==75.1.0
|
214 |
+
jaraco.text==3.12.1
|
215 |
+
inflect==7.3.1
|
216 |
+
jaraco.collections==5.1.0
|
217 |
+
autocommand==2.2.2
|
218 |
+
tomli==2.0.1
|
219 |
+
jaraco.context==5.3.0
|
220 |
+
jaraco.functools==4.0.1
|
221 |
+
importlib_resources==6.4.0
|
222 |
+
wheel==0.43.0
|
223 |
+
packaging==24.1
|
224 |
+
backports.tarfile==1.2.0
|
225 |
+
importlib_metadata==8.0.0
|
226 |
+
typing_extensions==4.12.2
|
227 |
+
zipp==3.19.2
|
228 |
+
typeguard==4.3.0
|
229 |
+
more-itertools==10.3.0
|
230 |
+
platformdirs==4.2.2
|
wandb/run-20241023_090557-paei5sn7/files/wandb-metadata.json
ADDED
@@ -0,0 +1,102 @@
1 |
+
{
|
2 |
+
"os": "Linux-5.4.0-196-generic-x86_64-with-glibc2.31",
|
3 |
+
"python": "3.11.10",
|
4 |
+
"startedAt": "2024-10-23T09:05:57.415346Z",
|
5 |
+
"args": [
|
6 |
+
"--local_rank=0",
|
7 |
+
"--actor_model_name_or_path",
|
8 |
+
"/data/align-anything/hantao/models/0916_ti_to_ti_sft",
|
9 |
+
"--reward_model_name_or_path",
|
10 |
+
"/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800",
|
11 |
+
"--reward_critic_model_name_or_path",
|
12 |
+
"/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800",
|
13 |
+
"--train_datasets",
|
14 |
+
"/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs",
|
15 |
+
"--train_template",
|
16 |
+
"spavl_ti2ti",
|
17 |
+
"--train_data_files",
|
18 |
+
"ti2ti_llf_prompt_only_tokenize.pt",
|
19 |
+
"--output_dir",
|
20 |
+
"../outputs/ppo_ti2ti_llf_1023_step_800",
|
21 |
+
"--save_interval",
|
22 |
+
"30"
|
23 |
+
],
|
24 |
+
"program": "-m align_anything.trainers.text_image_to_text_image.ppo",
|
25 |
+
"git": {
|
26 |
+
"remote": "https://github.com/PKU-Alignment/align-anything.git",
|
27 |
+
"commit": "6fde660afc9985323f147930eedf188a5699adc7"
|
28 |
+
},
|
29 |
+
"email": "[email protected]",
|
30 |
+
"root": "../outputs/ppo_ti2ti_llf_1023_step_800",
|
31 |
+
"host": "lyg0196",
|
32 |
+
"username": "align-anything",
|
33 |
+
"executable": "/home/align-anything/miniconda3/envs/hantao_cham/bin/python",
|
34 |
+
"cpu_count": 64,
|
35 |
+
"cpu_count_logical": 128,
|
36 |
+
"gpu": "[NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB]",
|
37 |
+
"gpu_count": 8,
|
38 |
+
"disk": {
|
39 |
+
"/": {
|
40 |
+
"total": "940744544256",
|
41 |
+
"used": "297219252224"
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"memory": {
|
45 |
+
"total": "540647583744"
|
46 |
+
},
|
47 |
+
"cpu": {
|
48 |
+
"count": 64,
|
49 |
+
"countLogical": 128
|
50 |
+
},
|
51 |
+
"gpu_nvidia": [
|
52 |
+
{
|
53 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
54 |
+
"memoryTotal": "85899345920",
|
55 |
+
"cudaCores": 6912,
|
56 |
+
"architecture": "Ampere"
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
60 |
+
"memoryTotal": "85899345920",
|
61 |
+
"cudaCores": 6912,
|
62 |
+
"architecture": "Ampere"
|
63 |
+
},
|
64 |
+
{
|
65 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
66 |
+
"memoryTotal": "85899345920",
|
67 |
+
"cudaCores": 6912,
|
68 |
+
"architecture": "Ampere"
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
72 |
+
"memoryTotal": "85899345920",
|
73 |
+
"cudaCores": 6912,
|
74 |
+
"architecture": "Ampere"
|
75 |
+
},
|
76 |
+
{
|
77 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
78 |
+
"memoryTotal": "85899345920",
|
79 |
+
"cudaCores": 6912,
|
80 |
+
"architecture": "Ampere"
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
84 |
+
"memoryTotal": "85899345920",
|
85 |
+
"cudaCores": 6912,
|
86 |
+
"architecture": "Ampere"
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
90 |
+
"memoryTotal": "85899345920",
|
91 |
+
"cudaCores": 6912,
|
92 |
+
"architecture": "Ampere"
|
93 |
+
},
|
94 |
+
{
|
95 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
96 |
+
"memoryTotal": "85899345920",
|
97 |
+
"cudaCores": 6912,
|
98 |
+
"architecture": "Ampere"
|
99 |
+
}
|
100 |
+
],
|
101 |
+
"cudaVersion": "12.4"
|
102 |
+
}
|
wandb/run-20241023_090557-paei5sn7/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
1 |
+
{"train/reward_critic_loss":0.40057873725891113,"train/reward_advantage":0.3540787100791931,"train/mean_generated_length":1,"_runtime":7168.249961391,"train/reward_value":-0.5498046875,"train/reward_critic_lr":5e-06,"train/kl_divergence":4.317548751831055,"_wandb":{"runtime":7168},"train/reward_with_kl_penalty":-0.1957259625196457,"train/max_generated_length":1,"train/actor_loss":-0.3540787100791931,"train/reward":-0.109375,"_step":236,"train/actor_lr":0,"train/reward_return":-0.1957259625196457,"train/step":236,"_timestamp":1.7296814635198205e+09}
|
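The wandb-summary.json above is a single JSON object holding the final logged metrics of the run (e.g. train/reward, train/kl_divergence, _step). A minimal sketch for inspecting it offline, assuming only the Python standard library and the file path as uploaded in this commit:

    import json

    # Path as listed in this upload; adjust if the file is stored elsewhere.
    path = "wandb/run-20241023_090557-paei5sn7/files/wandb-summary.json"
    with open(path) as f:
        summary = json.load(f)
    # Keys taken from the summary shown above.
    print(summary["train/reward"], summary["train/kl_divergence"], summary["_step"])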
wandb/run-20241023_090557-paei5sn7/logs/debug-internal.log
ADDED
@@ -0,0 +1,20 @@
1 |
+
{"time":"2024-10-23T09:05:57.421439083Z","level":"INFO","msg":"using version","core version":"0.18.3"}
|
2 |
+
{"time":"2024-10-23T09:05:57.421466903Z","level":"INFO","msg":"created symlink","path":"../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug-core.log"}
|
3 |
+
{"time":"2024-10-23T09:05:57.425862224Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
|
4 |
+
{"time":"2024-10-23T09:05:57.446650446Z","level":"INFO","msg":"created new stream","id":"paei5sn7"}
|
5 |
+
{"time":"2024-10-23T09:05:57.44668639Z","level":"INFO","msg":"stream: started","id":"paei5sn7"}
|
6 |
+
{"time":"2024-10-23T09:05:57.446720716Z","level":"INFO","msg":"handler: started","stream_id":{"value":"paei5sn7"}}
|
7 |
+
{"time":"2024-10-23T09:05:57.446711826Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"paei5sn7"}}
|
8 |
+
{"time":"2024-10-23T09:05:57.446734515Z","level":"INFO","msg":"sender: started","stream_id":{"value":"paei5sn7"}}
|
9 |
+
{"time":"2024-10-23T09:05:58.065915529Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
|
10 |
+
{"time":"2024-10-23T09:05:58.068726401Z","level":"INFO","msg":"Starting system monitor"}
|
11 |
+
{"time":"2024-10-23T11:05:25.66533688Z","level":"INFO","msg":"Stopping system monitor"}
|
12 |
+
{"time":"2024-10-23T11:05:25.691953146Z","level":"INFO","msg":"Stopped system monitor"}
|
13 |
+
{"time":"2024-10-23T11:05:26.186981401Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
|
14 |
+
{"time":"2024-10-23T11:05:26.187015556Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
|
15 |
+
{"time":"2024-10-23T11:05:27.289810318Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
16 |
+
{"time":"2024-10-23T11:05:28.84134155Z","level":"INFO","msg":"stream: closing","id":"paei5sn7"}
|
17 |
+
{"time":"2024-10-23T11:05:28.841377348Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"paei5sn7"}}
|
18 |
+
{"time":"2024-10-23T11:05:28.841437021Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"paei5sn7"}}
|
19 |
+
{"time":"2024-10-23T11:05:28.841525942Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"paei5sn7"}}
|
20 |
+
{"time":"2024-10-23T11:05:28.842923992Z","level":"INFO","msg":"stream: closed","id":"paei5sn7"}
|
wandb/run-20241023_090557-paei5sn7/logs/debug.log
ADDED
@@ -0,0 +1,33 @@
1 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
|
2 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Configure stats pid to 1071579
|
3 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
|
4 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
|
5 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
|
6 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
|
7 |
+
2024-10-23 09:05:57,409 WARNING MainThread:1071579 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.ppo
|
8 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.ppo'}
|
9 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Applying login settings: {}
|
10 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:_log_setup():532] Logging user logs to ../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug.log
|
11 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:_log_setup():533] Logging internal logs to ../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug-internal.log
|
12 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:init():617] calling init triggers
|
13 |
+
2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
|
14 |
+
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_prompt_batch_size': 8, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 2, 'actor_gradient_checkpointing': True, 'critic_gradient_checkpointing': True, 'actor_lr': 1e-05, 'actor_lr_scheduler_type': 'cosine', 'actor_lr_warmup_ratio': 0.03, 'actor_weight_decay': 0.01, 'critic_lr': 5e-06, 'critic_lr_scheduler_type': 'constant', 'critic_lr_warmup_ratio': 0.03, 'critic_weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'kl_coeff': 0.02, 'clip_range_ratio': 0.2, 'clip_range_score': 50.0, 'clip_range_value': 5.0, 'ptx_coeff': 16.0, 'gamma': 1.0, 'gae_lambda': 0.95, 'normalize_reward': False, 'update_iters': 1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs', 'train_template': 'spavl_ti2ti', 'train_size': 5000, 'train_split': None, 'train_subset': None, 'train_data_files': 'ti2ti_llf_prompt_only_tokenize.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': [], 'ptx_datasets': None, 'ptx_template': None, 'ptx_size': None, 'ptx_subset': None, 'ptx_split': None, 'ptx_data_files': None, 'ptx_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'ppo', 'output_dir': '../outputs/ppo_ti2ti_llf_1023_step_800', 'cache_dir': None, 'save_interval': 30.0}, 'model_cfgs': {'actor_model_name_or_path': '/data/align-anything/hantao/models/0916_ti_to_ti_sft', 'reward_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800', 'reward_critic_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800', 'trust_remote_code': True, 'model_max_length': 2048, 'temperature': 1.0, 'top_p': 1.0, 'repetition_penalty': 1.0}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': None}
|
15 |
+
2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():667] starting backend
|
16 |
+
2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():671] sending inform_init request
|
17 |
+
2024-10-23 09:05:57,414 INFO MainThread:1071579 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
18 |
+
2024-10-23 09:05:57,415 INFO MainThread:1071579 [wandb_init.py:init():684] backend started and connected
|
19 |
+
2024-10-23 09:05:57,418 INFO MainThread:1071579 [wandb_init.py:init():779] updated telemetry
|
20 |
+
2024-10-23 09:05:57,429 INFO MainThread:1071579 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
|
21 |
+
2024-10-23 09:05:58,062 INFO MainThread:1071579 [wandb_init.py:init():863] starting run threads in backend
|
22 |
+
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_console_start():2465] atexit reg
|
23 |
+
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2313] redirect: wrap_raw
|
24 |
+
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2378] Wrapping output streams.
|
25 |
+
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2403] Redirects installed.
|
26 |
+
2024-10-23 09:05:58,197 INFO MainThread:1071579 [wandb_init.py:init():907] run started, returning control to user process
|
27 |
+
2024-10-23 11:05:25,658 INFO MainThread:1071579 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/paei5sn7
|
28 |
+
2024-10-23 11:05:25,662 INFO MainThread:1071579 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
|
29 |
+
2024-10-23 11:05:25,664 INFO MainThread:1071579 [wandb_run.py:_restore():2410] restore
|
30 |
+
2024-10-23 11:05:25,664 INFO MainThread:1071579 [wandb_run.py:_restore():2416] restore done
|
31 |
+
2024-10-23 11:05:28,807 INFO MainThread:1071579 [wandb_run.py:_footer_history_summary_info():4049] rendering history
|
32 |
+
2024-10-23 11:05:28,809 INFO MainThread:1071579 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
|
33 |
+
2024-10-23 11:05:28,839 INFO MainThread:1071579 [wandb_run.py:_footer_sync_info():4008] logging synced files
|
wandb/run-20241023_090557-paei5sn7/run-paei5sn7.wandb
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7875273453296c40e06ab7dc88ef1748a4c98f52eba4903dcc2530faa8b6a23d
|
3 |
+
size 6283361
|