htlou committed
Commit 64ec51a · verified · 1 Parent(s): 8540a58

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20241023_090557-paei5sn7/run-paei5sn7.wandb filter=lfs diff=lfs merge=lfs -text
arguments.yaml ADDED
@@ -0,0 +1,94 @@
+ bnb_cfgs:
+   bnb_4bit_compute_dtype: float16
+   bnb_4bit_quant_type: nf4
+   bnb_4bit_use_double_quant: true
+   load_in_4bit: true
+   load_in_8bit: false
+   use_bnb: false
+ data_cfgs:
+   eval_data_files: null
+   eval_datasets: null
+   eval_optional_args: []
+   eval_size: null
+   eval_split: null
+   eval_subset: null
+   eval_template: null
+   ptx_data_files: null
+   ptx_datasets: null
+   ptx_optional_args: []
+   ptx_size: null
+   ptx_split: null
+   ptx_subset: null
+   ptx_template: null
+   train_data_files: ti2ti_llf_prompt_only_tokenize.pt
+   train_datasets: /data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs
+   train_optional_args: []
+   train_size: 5000
+   train_split: null
+   train_subset: null
+   train_template: spavl_ti2ti
+ logger_cfgs:
+   cache_dir: null
+   log_project: align-anything
+   log_run_name: ppo
+   log_type: wandb
+   output_dir: ../outputs/ppo_ti2ti_llf_1023_step_800
+   save_interval: 30.0
+ lora_cfgs:
+   inference_mode: false
+   lora_alpha: 16
+   lora_dropout: 0.1
+   r: 16
+   save_full_model: true
+   target_modules:
+   - q_proj
+   - v_proj
+   task_type: TaskType.CAUSAL_LM
+   use_lora: false
+ model_cfgs:
+   actor_model_name_or_path: /data/align-anything/hantao/models/0916_ti_to_ti_sft
+   model_max_length: 2048
+   repetition_penalty: 1.0
+   reward_critic_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800
+   reward_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800
+   temperature: 1.0
+   top_p: 1.0
+   trust_remote_code: true
+ special_tokens: null
+ train_cfgs:
+   actor_gradient_checkpointing: true
+   actor_lr: 1.0e-05
+   actor_lr_scheduler_type: cosine
+   actor_lr_warmup_ratio: 0.03
+   actor_weight_decay: 0.01
+   adam_betas:
+   - 0.9
+   - 0.95
+   bf16: true
+   clip_range_ratio: 0.2
+   clip_range_score: 50.0
+   clip_range_value: 5.0
+   critic_gradient_checkpointing: true
+   critic_lr: 5.0e-06
+   critic_lr_scheduler_type: constant
+   critic_lr_warmup_ratio: 0.03
+   critic_weight_decay: 0.0
+   ds_cfgs: ds_z3_config.json
+   epochs: 3
+   eval_interval: 10
+   eval_strategy: epoch
+   fp16: false
+   freeze_language_model: true
+   freeze_mm_proj: true
+   freeze_vision_tower: false
+   gae_lambda: 0.95
+   gamma: 1.0
+   gradient_accumulation_steps: 2
+   kl_coeff: 0.02
+   normalize_reward: false
+   per_device_eval_batch_size: 8
+   per_device_prompt_batch_size: 8
+   per_device_train_batch_size: 8
+   ptx_coeff: 16.0
+   seed: 42
+   update_iters: 1
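
arguments.yaml is a plain YAML file, so the run's hyperparameters can be inspected programmatically. A minimal sketch, assuming PyYAML is installed and the file has been downloaded into the current directory:

# Minimal sketch: load arguments.yaml and print a few key hyperparameters.
# Assumes PyYAML is available and arguments.yaml is a local file.
import yaml

with open("arguments.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

print(cfg["train_cfgs"]["actor_lr"])      # 1e-05
print(cfg["train_cfgs"]["kl_coeff"])      # 0.02
print(cfg["logger_cfgs"]["output_dir"])   # ../outputs/ppo_ti2ti_llf_1023_step_800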
config.json ADDED
The diff for this file is too large to render. See raw diff
 
environ.txt ADDED
@@ -0,0 +1,62 @@
+ CONDA_DEFAULT_ENV=hantao_cham
+ CONDA_EXE=/home/align-anything/miniconda3/bin/conda
+ CONDA_PREFIX=/home/align-anything/miniconda3/envs/hantao_cham
+ CONDA_PREFIX_1=/home/align-anything/miniconda3
+ CONDA_PREFIX_2=/home/align-anything/miniconda3/envs/hantao_cham
+ CONDA_PREFIX_3=/home/align-anything/miniconda3/envs/hantao_stable
+ CONDA_PREFIX_4=/home/align-anything/miniconda3/envs/hantao_cham
+ CONDA_PREFIX_5=/home/align-anything/miniconda3/envs/hantao_stable
+ CONDA_PREFIX_6=/home/align-anything/miniconda3/envs/hantao_cham
+ CONDA_PREFIX_7=/home/align-anything/miniconda3/envs/hantao_stable
+ CONDA_PROMPT_MODIFIER=(hantao_cham)
+ CONDA_PYTHON_EXE=/home/align-anything/miniconda3/bin/python
+ CONDA_SHLVL=8
+ CRASHDIR=/etc/ShellCrash
+ CROSS_RANK=0
+ CROSS_SIZE=1
+ CUDA_MODULE_LOADING=LAZY
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/2000/bus
+ HOME=/home/align-anything
+ LANG=en_US.UTF-8
+ LD_LIBRARY_PATH=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/../../lib64:
+ LESSCLOSE=/usr/bin/lesspipe %s %s
+ LESSOPEN=| /usr/bin/lesspipe %s
+ LOCAL_RANK=0
+ LOCAL_SIZE=8
+ LOGLEVEL=WARNING
+ LOGNAME=align-anything
+ LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
+ MASTER_ADDR=127.0.0.1
+ MASTER_PORT=23139
+ MOTD_SHOWN=pam
+ OLDPWD=/data/align-anything/hantao/align-anything/projects/text_image_to_text_image
+ PATH=/home/align-anything/miniconda3/envs/hantao_cham/bin:/home/align-anything/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
+ PWD=/data/align-anything/hantao/align-anything/scripts
+ PYGAME_HIDE_SUPPORT_PROMPT=1
+ PYTHONHASHSEED=42
+ PYTHONPATH=/data/align-anything/hantao/align-anything
+ QT_QPA_FONTDIR=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/qt/fonts
+ QT_QPA_PLATFORM_PLUGIN_PATH=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/qt/plugins
+ RANK=0
+ SHELL=/bin/bash
+ SHLVL=3
+ SSH_CLIENT=111.205.230.212 43947 30600
+ SSH_CONNECTION=111.205.230.212 44215 10.10.212.196 30600
+ SSH_TTY=/dev/pts/1
+ TERM=screen
+ TMUX=/tmp/tmux-2000/default,53635,1
+ TMUX_PANE=%1
+ USER=align-anything
+ WANDB_API_KEY=7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33
+ WANDB_MODE=online
+ WANDB_SERVICE=2-1071579-tcp-localhost-45673
+ WORLD_SIZE=8
+ XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
+ XDG_RUNTIME_DIR=/run/user/2000
+ XDG_SESSION_CLASS=user
+ XDG_SESSION_ID=15
+ XDG_SESSION_TYPE=tty
+ _=/home/align-anything/miniconda3/envs/hantao_cham/bin/deepspeed
+ _CE_CONDA=
+ _CE_M=
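
environ.txt records the launcher environment for rank 0; the distributed topology of the run is carried in WORLD_SIZE, RANK, LOCAL_RANK, MASTER_ADDR, and MASTER_PORT. A small sketch of how a worker process could read these variables (values in comments are the ones captured above):

# Small sketch: read the distributed-run topology recorded in environ.txt.
# These variables are set per rank by the deepspeed launcher.
import os

world_size = int(os.environ.get("WORLD_SIZE", "1"))   # 8 GPUs in this run
local_rank = int(os.environ.get("LOCAL_RANK", "0"))   # 0 for the rank that wrote this dump
master = f'{os.environ.get("MASTER_ADDR", "127.0.0.1")}:{os.environ.get("MASTER_PORT", "29500")}'
print(world_size, local_rank, master)                 # 8 0 127.0.0.1:23139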
preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "crop_size": {
+     "height": 512,
+     "width": 512
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     1.0,
+     1.0,
+     1.0
+   ],
+   "image_processor_type": "ChameleonImageProcessor",
+   "image_std": [
+     1.0,
+     1.0,
+     1.0
+   ],
+   "processor_class": "ChameleonProcessor",
+   "resample": 1,
+   "rescale_factor": 0.0078,
+   "size": {
+     "shortest_edge": 512
+   }
+ }
processor_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "image_seq_length": 1024,
+   "image_token": "<image>",
+   "processor_class": "ChameleonProcessor"
+ }
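
Together, preprocessor_config.json and processor_config.json describe a ChameleonProcessor: 512x512 center-cropped inputs and 1024 image tokens per <image> placeholder. A hedged sketch of loading it with transformers' AutoProcessor, assuming a recent transformers build with Chameleon support (the run used 4.44.0.dev0) and that local_dir points at a local copy of this repository:

# Hedged sketch: load the processor described by the two JSON files above.
# `local_dir` is a hypothetical path to a downloaded copy of this repo.
from transformers import AutoProcessor

local_dir = "."
processor = AutoProcessor.from_pretrained(local_dir)
print(type(processor).__name__)              # expected: ChameleonProcessor
print(processor.image_processor.crop_size)   # {'height': 512, 'width': 512}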
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e463e8c312e3fcebc9ff47999f2efbe72a12805c69e3ba885a8b38b4ebe8d478
+ size 14165009930
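
pytorch_model.bin is committed as a Git LFS pointer; the actual weights are about 14.2 GB. A minimal sketch of inspecting the checkpoint once the real LFS object has been pulled (not just the pointer file) and enough CPU RAM is available:

# Minimal sketch: peek at the checkpoint's tensors without instantiating a model class.
# Assumes pytorch_model.bin is the full ~14 GB file downloaded locally.
import torch

state_dict = torch.load("pytorch_model.bin", map_location="cpu")
print(len(state_dict), "tensors")
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape), tensor.dtype)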
script.sh ADDED
@@ -0,0 +1,48 @@
+ #!/usr/bin/env bash
+ #
+ # Copyright 2024 PKU-Alignment Team. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+
+ # Initialize variables
+ # For wandb online logging
+ export WANDB_API_KEY="7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33"
+ # Source the setup script
+ # source ./setup.sh
+
+ export WANDB_MODE=online
+
+ ACTOR_MODEL_NAME_OR_PATH="/data/align-anything/hantao/models/0916_ti_to_ti_sft"
+ CRITIC_MODEL_NAME_OR_PATH="/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800"
+ REWARD_MODEL_NAME_OR_PATH="/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800"
+ TRAIN_DATASETS="/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs"
+ PTX_DATASETS="/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs"
+ OUTPUT_DIR="../outputs/ppo_ti2ti_llf_1023_step_800"
+
+ # Source the setup script
+ source ./setup.sh
+
+ # Execute deepspeed command
+ deepspeed \
+   --master_port ${MASTER_PORT} \
+   --module align_anything.trainers.text_image_to_text_image.ppo \
+   --actor_model_name_or_path ${ACTOR_MODEL_NAME_OR_PATH} \
+   --reward_model_name_or_path ${REWARD_MODEL_NAME_OR_PATH} \
+   --reward_critic_model_name_or_path ${CRITIC_MODEL_NAME_OR_PATH} \
+   --train_datasets ${TRAIN_DATASETS} \
+   --train_template spavl_ti2ti \
+   --train_data_files ti2ti_llf_prompt_only_tokenize.pt \
+   --output_dir ${OUTPUT_DIR} \
+   --save_interval 30
+
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "<reserved08706>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
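
special_tokens_map.json pairs with the tokenizer.json and tokenizer_config.json files added below. A hedged sketch of verifying that the special tokens above are wired into the loaded tokenizer, again assuming local_dir is a local copy of this repository:

# Hedged sketch: confirm the declared special tokens after loading the tokenizer.
# `local_dir` is a hypothetical path to a downloaded copy of this repo.
from transformers import AutoTokenizer

local_dir = "."
tokenizer = AutoTokenizer.from_pretrained(local_dir)
print(tokenizer.special_tokens_map)   # expects <s>, </s>, <pad>, <reserved08706>, <unk>
print(tokenizer.pad_token_id)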
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
wandb/debug-internal.log ADDED
@@ -0,0 +1,20 @@
+ {"time":"2024-10-23T09:05:57.421439083Z","level":"INFO","msg":"using version","core version":"0.18.3"}
+ {"time":"2024-10-23T09:05:57.421466903Z","level":"INFO","msg":"created symlink","path":"../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug-core.log"}
+ {"time":"2024-10-23T09:05:57.425862224Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
+ {"time":"2024-10-23T09:05:57.446650446Z","level":"INFO","msg":"created new stream","id":"paei5sn7"}
+ {"time":"2024-10-23T09:05:57.44668639Z","level":"INFO","msg":"stream: started","id":"paei5sn7"}
+ {"time":"2024-10-23T09:05:57.446720716Z","level":"INFO","msg":"handler: started","stream_id":{"value":"paei5sn7"}}
+ {"time":"2024-10-23T09:05:57.446711826Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"paei5sn7"}}
+ {"time":"2024-10-23T09:05:57.446734515Z","level":"INFO","msg":"sender: started","stream_id":{"value":"paei5sn7"}}
+ {"time":"2024-10-23T09:05:58.065915529Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
+ {"time":"2024-10-23T09:05:58.068726401Z","level":"INFO","msg":"Starting system monitor"}
+ {"time":"2024-10-23T11:05:25.66533688Z","level":"INFO","msg":"Stopping system monitor"}
+ {"time":"2024-10-23T11:05:25.691953146Z","level":"INFO","msg":"Stopped system monitor"}
+ {"time":"2024-10-23T11:05:26.186981401Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
+ {"time":"2024-10-23T11:05:26.187015556Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
+ {"time":"2024-10-23T11:05:27.289810318Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+ {"time":"2024-10-23T11:05:28.84134155Z","level":"INFO","msg":"stream: closing","id":"paei5sn7"}
+ {"time":"2024-10-23T11:05:28.841377348Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"paei5sn7"}}
+ {"time":"2024-10-23T11:05:28.841437021Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"paei5sn7"}}
+ {"time":"2024-10-23T11:05:28.841525942Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"paei5sn7"}}
+ {"time":"2024-10-23T11:05:28.842923992Z","level":"INFO","msg":"stream: closed","id":"paei5sn7"}
wandb/debug.log ADDED
@@ -0,0 +1,33 @@
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Configure stats pid to 1071579
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
+ 2024-10-23 09:05:57,409 WARNING MainThread:1071579 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.ppo
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.ppo'}
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Applying login settings: {}
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:_log_setup():532] Logging user logs to ../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug.log
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:_log_setup():533] Logging internal logs to ../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug-internal.log
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:init():617] calling init triggers
+ 2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_prompt_batch_size': 8, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 2, 'actor_gradient_checkpointing': True, 'critic_gradient_checkpointing': True, 'actor_lr': 1e-05, 'actor_lr_scheduler_type': 'cosine', 'actor_lr_warmup_ratio': 0.03, 'actor_weight_decay': 0.01, 'critic_lr': 5e-06, 'critic_lr_scheduler_type': 'constant', 'critic_lr_warmup_ratio': 0.03, 'critic_weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'kl_coeff': 0.02, 'clip_range_ratio': 0.2, 'clip_range_score': 50.0, 'clip_range_value': 5.0, 'ptx_coeff': 16.0, 'gamma': 1.0, 'gae_lambda': 0.95, 'normalize_reward': False, 'update_iters': 1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs', 'train_template': 'spavl_ti2ti', 'train_size': 5000, 'train_split': None, 'train_subset': None, 'train_data_files': 'ti2ti_llf_prompt_only_tokenize.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': [], 'ptx_datasets': None, 'ptx_template': None, 'ptx_size': None, 'ptx_subset': None, 'ptx_split': None, 'ptx_data_files': None, 'ptx_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'ppo', 'output_dir': '../outputs/ppo_ti2ti_llf_1023_step_800', 'cache_dir': None, 'save_interval': 30.0}, 'model_cfgs': {'actor_model_name_or_path': '/data/align-anything/hantao/models/0916_ti_to_ti_sft', 'reward_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800', 'reward_critic_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800', 'trust_remote_code': True, 'model_max_length': 2048, 'temperature': 1.0, 'top_p': 1.0, 'repetition_penalty': 1.0}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': None}
+ 2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():667] starting backend
+ 2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():671] sending inform_init request
+ 2024-10-23 09:05:57,414 INFO MainThread:1071579 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-10-23 09:05:57,415 INFO MainThread:1071579 [wandb_init.py:init():684] backend started and connected
+ 2024-10-23 09:05:57,418 INFO MainThread:1071579 [wandb_init.py:init():779] updated telemetry
+ 2024-10-23 09:05:57,429 INFO MainThread:1071579 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
+ 2024-10-23 09:05:58,062 INFO MainThread:1071579 [wandb_init.py:init():863] starting run threads in backend
+ 2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_console_start():2465] atexit reg
+ 2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2313] redirect: wrap_raw
+ 2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2378] Wrapping output streams.
+ 2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2403] Redirects installed.
+ 2024-10-23 09:05:58,197 INFO MainThread:1071579 [wandb_init.py:init():907] run started, returning control to user process
+ 2024-10-23 11:05:25,658 INFO MainThread:1071579 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/paei5sn7
+ 2024-10-23 11:05:25,662 INFO MainThread:1071579 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
+ 2024-10-23 11:05:25,664 INFO MainThread:1071579 [wandb_run.py:_restore():2410] restore
+ 2024-10-23 11:05:25,664 INFO MainThread:1071579 [wandb_run.py:_restore():2416] restore done
+ 2024-10-23 11:05:28,807 INFO MainThread:1071579 [wandb_run.py:_footer_history_summary_info():4049] rendering history
+ 2024-10-23 11:05:28,809 INFO MainThread:1071579 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
+ 2024-10-23 11:05:28,839 INFO MainThread:1071579 [wandb_run.py:_footer_sync_info():4008] logging synced files
wandb/run-20241023_090557-paei5sn7/files/config.yaml ADDED
@@ -0,0 +1,143 @@
+ _wandb:
+   value:
+     cli_version: 0.18.3
+     m: []
+     python_version: 3.11.10
+     t:
+       "1":
+       - 1
+       - 11
+       - 41
+       - 49
+       - 51
+       - 55
+       - 71
+       - 83
+       - 98
+       - 105
+       "2":
+       - 1
+       - 11
+       - 41
+       - 49
+       - 51
+       - 55
+       - 71
+       - 83
+       - 98
+       - 105
+       "3":
+       - 2
+       - 13
+       - 16
+       - 23
+       - 55
+       - 61
+       "4": 3.11.10
+       "5": 0.18.3
+       "6": 4.44.0.dev0
+       "8":
+       - 5
+       "12": 0.18.3
+       "13": linux-x86_64
+ bnb_cfgs:
+   value:
+     bnb_4bit_compute_dtype: float16
+     bnb_4bit_quant_type: nf4
+     bnb_4bit_use_double_quant: true
+     load_in_4bit: true
+     load_in_8bit: false
+     use_bnb: false
+ data_cfgs:
+   value:
+     eval_data_files: null
+     eval_datasets: null
+     eval_optional_args: []
+     eval_size: null
+     eval_split: null
+     eval_subset: null
+     eval_template: null
+     ptx_data_files: null
+     ptx_datasets: null
+     ptx_optional_args: []
+     ptx_size: null
+     ptx_split: null
+     ptx_subset: null
+     ptx_template: null
+     train_data_files: ti2ti_llf_prompt_only_tokenize.pt
+     train_datasets: /data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs
+     train_optional_args: []
+     train_size: 5000
+     train_split: null
+     train_subset: null
+     train_template: spavl_ti2ti
+ logger_cfgs:
+   value:
+     cache_dir: null
+     log_project: align-anything
+     log_run_name: ppo
+     log_type: wandb
+     output_dir: ../outputs/ppo_ti2ti_llf_1023_step_800
+     save_interval: 30
+ lora_cfgs:
+   value:
+     inference_mode: false
+     lora_alpha: 16
+     lora_dropout: 0.1
+     r: 16
+     save_full_model: true
+     target_modules:
+     - q_proj
+     - v_proj
+     task_type: TaskType.CAUSAL_LM
+     use_lora: false
+ model_cfgs:
+   value:
+     actor_model_name_or_path: /data/align-anything/hantao/models/0916_ti_to_ti_sft
+     model_max_length: 2048
+     repetition_penalty: 1
+     reward_critic_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800
+     reward_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800
+     temperature: 1
+     top_p: 1
+     trust_remote_code: true
+ special_tokens:
+   value: null
+ train_cfgs:
+   value:
+     actor_gradient_checkpointing: true
+     actor_lr: 1e-05
+     actor_lr_scheduler_type: cosine
+     actor_lr_warmup_ratio: 0.03
+     actor_weight_decay: 0.01
+     adam_betas:
+     - 0.9
+     - 0.95
+     bf16: true
+     clip_range_ratio: 0.2
+     clip_range_score: 50
+     clip_range_value: 5
+     critic_gradient_checkpointing: true
+     critic_lr: 5e-06
+     critic_lr_scheduler_type: constant
+     critic_lr_warmup_ratio: 0.03
+     critic_weight_decay: 0
+     ds_cfgs: ds_z3_config.json
+     epochs: 3
+     eval_interval: 10
+     eval_strategy: epoch
+     fp16: false
+     freeze_language_model: true
+     freeze_mm_proj: true
+     freeze_vision_tower: false
+     gae_lambda: 0.95
+     gamma: 1
+     gradient_accumulation_steps: 2
+     kl_coeff: 0.02
+     normalize_reward: false
+     per_device_eval_batch_size: 8
+     per_device_prompt_batch_size: 8
+     per_device_train_batch_size: 8
+     ptx_coeff: 16
+     seed: 42
+     update_iters: 1
wandb/run-20241023_090557-paei5sn7/files/output.log ADDED
@@ -0,0 +1,261 @@
1
+ ***** Running training *****
2
+ Training 1/3 epoch: 0%| | 0/237 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
3
+ Training 1/3 epoch (reward -0.3069): 13%|██████████████████████████████▌ | 30/237 [1:00:27<12:52:22, 223.88s/it]
4
+ [2024-10-23 09:13:46,654] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
5
+ [2024-10-23 09:13:50,992] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
6
+ [2024-10-23 09:18:33,696] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
7
+ [2024-10-23 09:18:38,270] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
8
+ [2024-10-23 09:23:48,615] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
9
+ [2024-10-23 09:23:53,939] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
10
+ [2024-10-23 09:27:04,368] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
11
+ [2024-10-23 09:27:08,626] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
12
+ [2024-10-23 09:30:00,787] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
13
+ [2024-10-23 09:30:04,867] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
14
+ [2024-10-23 09:32:26,802] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
15
+ [2024-10-23 09:32:30,881] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
16
+ [2024-10-23 09:36:44,344] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
17
+ [2024-10-23 09:36:47,973] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
18
+ [2024-10-23 09:39:01,718] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
19
+ [2024-10-23 09:39:01,719] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[9.908858470377793e-06, 9.908858470377793e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
20
+ [2024-10-23 09:39:01,719] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=14.571748606590951, CurrSamplesPerSec=15.598687308172462, MemAllocated=33.2GB, MaxMemAllocated=48.11GB
21
+ [2024-10-23 09:39:05,687] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
22
+ [2024-10-23 09:39:05,688] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
23
+ [2024-10-23 09:39:05,689] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=15.239809655137655, CurrSamplesPerSec=16.432232979486614, MemAllocated=33.2GB, MaxMemAllocated=48.11GB
24
+ [2024-10-23 09:41:40,302] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
25
+ [2024-10-23 09:41:43,921] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
26
+ [2024-10-23 09:58:00,759] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
27
+ [2024-10-23 09:58:06,692] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
28
+ [2024-10-23 10:06:16,814] [WARNING] [stage3.py:2104:step] 3 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
29
+ [2024-10-23 10:06:22,976] [WARNING] [stage3.py:2104:step] 4 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
30
+ Saving checkpoint at step 30 ...
31
+ Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
32
+ Saving 16-bit model...
33
+ [2024-10-23 10:06:35,014] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step15 is about to be saved!
34
+ [2024-10-23 10:06:35,016] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin, tag: global_step15
35
+ [2024-10-23 10:06:35,016] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin...
36
+ [2024-10-23 10:06:58,291] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin.
37
+ [2024-10-23 10:06:58,293] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step15 is ready now!
38
+ Model saved!
39
+ Saving 16-bit model...
40
+ [2024-10-23 10:07:09,145] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step15 is about to be saved!
41
+ [2024-10-23 10:07:09,146] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin, tag: global_step15
42
+ [2024-10-23 10:07:09,146] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin...
43
+ [2024-10-23 10:07:32,380] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin.
44
+ [2024-10-23 10:07:32,381] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step15 is ready now!
45
+ Model saved!
46
+ Model saved!
47
+ Checkpoint saved.
48
+ [2024-10-23 10:15:51,652] [WARNING] [stage3.py:2104:step] 3 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
49
+ [2024-10-23 10:15:57,685] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
50
+ [2024-10-23 10:16:30,368] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
51
+ [2024-10-23 10:16:33,757] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
52
+ [2024-10-23 10:17:06,123] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
53
+ [2024-10-23 10:17:09,627] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
54
+ [2024-10-23 10:17:35,330] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
55
+ [2024-10-23 10:17:38,641] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
56
+ [2024-10-23 10:18:10,448] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
57
+ [2024-10-23 10:18:10,449] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[9.470431355738257e-06, 9.470431355738257e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
58
+ [2024-10-23 10:18:10,450] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=14.073356193448515, CurrSamplesPerSec=18.590383641536306, MemAllocated=33.2GB, MaxMemAllocated=52.44GB
59
+ [2024-10-23 10:18:13,794] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
60
+ [2024-10-23 10:18:13,795] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
61
+ [2024-10-23 10:18:13,795] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=14.764142588014744, CurrSamplesPerSec=19.524800757549972, MemAllocated=33.2GB, MaxMemAllocated=52.44GB
62
+ [2024-10-23 10:18:45,865] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
63
+ [2024-10-23 10:18:49,172] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
64
+ [2024-10-23 10:19:24,641] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
65
+ [2024-10-23 10:19:27,927] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
66
+ [2024-10-23 10:19:54,182] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
67
+ [2024-10-23 10:19:57,506] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
68
+ [2024-10-23 10:20:51,612] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
69
+ [2024-10-23 10:20:54,879] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
70
+ [2024-10-23 10:21:17,299] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
71
+ [2024-10-23 10:21:20,616] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
72
+ [2024-10-23 10:21:42,642] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
73
+ [2024-10-23 10:21:45,941] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
74
+ [2024-10-23 10:22:12,002] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
75
+ [2024-10-23 10:22:15,333] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
76
+ [2024-10-23 10:22:36,950] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
77
+ [2024-10-23 10:22:40,236] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
78
+ [2024-10-23 10:23:01,348] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
79
+ [2024-10-23 10:23:01,349] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[8.70045279830626e-06, 8.70045279830626e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
80
+ [2024-10-23 10:23:01,350] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=15.461989587711498, CurrSamplesPerSec=19.013700639766963, MemAllocated=33.15GB, MaxMemAllocated=52.44GB
81
+ [2024-10-23 10:23:04,658] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
82
+ [2024-10-23 10:23:04,659] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
83
+ [2024-10-23 10:23:04,659] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=16.198688238179816, CurrSamplesPerSec=19.81263984833098, MemAllocated=33.15GB, MaxMemAllocated=52.44GB
84
+ Saving checkpoint at step 60 ...
85
+ Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
86
+ Saving 16-bit model...
87
+ [2024-10-23 10:23:19,118] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step30 is about to be saved!
88
+ [2024-10-23 10:23:19,120] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin, tag: global_step30
89
+ [2024-10-23 10:23:19,120] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin...
90
+ [2024-10-23 10:23:37,909] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin.
91
+ [2024-10-23 10:23:37,912] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step30 is ready now!
92
+ Model saved!
93
+ Saving 16-bit model...
94
+ [2024-10-23 10:23:49,578] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step30 is about to be saved!
95
+ [2024-10-23 10:23:49,578] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin, tag: global_step30
96
+ [2024-10-23 10:23:49,579] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin...
97
+ [2024-10-23 10:24:12,803] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin.
98
+ [2024-10-23 10:24:12,804] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step30 is ready now!
99
+ Model saved!
100
+ Model saved!
101
+ Checkpoint saved.
102
+ [2024-10-23 10:25:22,814] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
103
+ [2024-10-23 10:25:26,107] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
104
+ [2024-10-23 10:26:58,273] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
105
+ [2024-10-23 10:27:01,584] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
106
+ [2024-10-23 10:28:01,902] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[7.656028585269017e-06, 7.656028585269017e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
107
+ [2024-10-23 10:28:01,903] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=16.36638951494034, CurrSamplesPerSec=29.39789939491861, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
108
+ [2024-10-23 10:28:05,224] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
109
+ [2024-10-23 10:28:05,225] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=17.12711182391296, CurrSamplesPerSec=30.71016374643415, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
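Each optimizer step is logged twice here: the first [Rank 0] step line shows the actor learning rate decaying on its cosine schedule (from 1e-05), while the second shows the critic learning rate held constant at 5e-06, matching the actor_lr_scheduler_type: cosine / critic_lr_scheduler_type: constant settings echoed in the config dump in debug.log further down. A rough reconstruction of the actor schedule, assuming a standard linear-warmup-plus-cosine decay over the run's 118 global steps (the run ends at global_step118 below); the trainer's exact scheduler may differ, but this reproduces the logged values closely:

import math

def warmup_cosine_lr(step, base_lr=1.0e-5, total_steps=118, warmup_ratio=0.03):
    # Assumed reconstruction, not the trainer's exact scheduler.
    warmup = warmup_ratio * total_steps
    if step < warmup:
        return base_lr * step / warmup
    progress = (step - warmup) / (total_steps - warmup)
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * progress))

print(warmup_cosine_lr(40))   # ~7.7e-06 vs the 7.656e-06 logged at step 40
print(warmup_cosine_lr(110))  # ~1.2e-07 vs the 1.189e-07 logged at step 110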
110
+ [2024-10-23 10:28:25,905] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
111
+ [2024-10-23 10:28:29,159] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
112
+ [2024-10-23 10:29:13,184] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
113
+ [2024-10-23 10:29:16,439] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
114
+ [2024-10-23 10:29:37,209] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
115
+ [2024-10-23 10:29:40,499] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
116
+ Saving checkpoint at step 90 ...
117
+ Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
118
+ Saving 16-bit model...
119
+ [2024-10-23 10:30:18,686] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step45 is about to be saved!
120
+ [2024-10-23 10:30:18,688] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin, tag: global_step45
121
+ [2024-10-23 10:30:18,688] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin...
122
+ [2024-10-23 10:30:38,714] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin.
123
+ [2024-10-23 10:30:38,715] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step45 is ready now!
124
+ Model saved!
125
+ Saving 16-bit model...
126
+ [2024-10-23 10:30:48,994] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step45 is about to be saved!
127
+ [2024-10-23 10:30:48,995] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin, tag: global_step45
128
+ [2024-10-23 10:30:48,995] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin...
129
+ [2024-10-23 10:31:12,474] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin.
130
+ [2024-10-23 10:31:12,476] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step45 is ready now!
131
+ Model saved!
132
+ Model saved!
133
+ Checkpoint saved.
134
+ [2024-10-23 10:33:08,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[6.41461888258465e-06, 6.41461888258465e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
135
+ [2024-10-23 10:33:08,192] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=16.840242031082227, CurrSamplesPerSec=18.082056043756573, MemAllocated=33.1GB, MaxMemAllocated=52.44GB
136
+ [2024-10-23 10:33:11,652] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
137
+ [2024-10-23 10:33:11,653] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=17.604303267186207, CurrSamplesPerSec=19.299485552228777, MemAllocated=33.1GB, MaxMemAllocated=52.44GB
138
+ [2024-10-23 10:37:08,335] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[5.068293368829755e-06, 5.068293368829755e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
139
+ [2024-10-23 10:37:08,336] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=17.145914370621718, CurrSamplesPerSec=19.351638771074402, MemAllocated=33.14GB, MaxMemAllocated=52.44GB
140
+ [2024-10-23 10:37:11,634] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
141
+ [2024-10-23 10:37:11,635] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=17.89246566680119, CurrSamplesPerSec=19.82526412245786, MemAllocated=33.14GB, MaxMemAllocated=52.44GB
142
+ Saving checkpoint at step 120 ...
143
+ Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
144
+ Saving 16-bit model...
145
+ [2024-10-23 10:37:26,476] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step60 is about to be saved!
146
+ [2024-10-23 10:37:26,477] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin, tag: global_step60
147
+ [2024-10-23 10:37:26,478] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin...
148
+ [2024-10-23 10:37:44,898] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin.
149
+ [2024-10-23 10:37:44,900] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step60 is ready now!
150
+ Model saved!
151
+ Saving 16-bit model...
152
+ [2024-10-23 10:37:55,791] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step60 is about to be saved!
153
+ [2024-10-23 10:37:55,792] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin, tag: global_step60
154
+ [2024-10-23 10:37:55,792] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin...
155
+ [2024-10-23 10:38:18,798] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin.
156
+ [2024-10-23 10:38:18,799] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step60 is ready now!
157
+ Model saved!
158
+ Model saved!
159
+ Checkpoint saved.
160
+ [2024-10-23 10:42:13,477] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[3.7169028483301333e-06, 3.7169028483301333e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
161
+ [2024-10-23 10:42:13,478] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=17.37727340710681, CurrSamplesPerSec=18.834808666541562, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
162
+ [2024-10-23 10:42:16,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
163
+ [2024-10-23 10:42:16,796] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=18.13780306709264, CurrSamplesPerSec=19.724826811206178, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
164
+ Saving checkpoint at step 150 ...
165
+ Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
166
+ Saving 16-bit model...
167
+ [2024-10-23 10:44:30,480] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step75 is about to be saved!
168
+ [2024-10-23 10:44:30,481] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin, tag: global_step75
169
+ [2024-10-23 10:44:30,482] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin...
170
+ [2024-10-23 10:44:50,645] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin.
171
+ [2024-10-23 10:44:50,646] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step75 is ready now!
172
+ Model saved!
173
+ Saving 16-bit model...
174
+ [2024-10-23 10:45:00,723] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step75 is about to be saved!
175
+ [2024-10-23 10:45:00,724] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin, tag: global_step75
176
+ [2024-10-23 10:45:00,725] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin...
177
+ [2024-10-23 10:45:22,429] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin.
178
+ [2024-10-23 10:45:22,430] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step75 is ready now!
179
+ Model saved!
180
+ Model saved!
181
+ Checkpoint saved.
182
+ [2024-10-23 10:47:09,650] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[2.4606737737909696e-06, 2.4606737737909696e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
183
+ [2024-10-23 10:47:09,651] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=17.61415636579215, CurrSamplesPerSec=18.93111536799049, MemAllocated=33.29GB, MaxMemAllocated=52.44GB
184
+ [2024-10-23 10:47:13,054] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
185
+ [2024-10-23 10:47:13,055] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=18.395644821730034, CurrSamplesPerSec=19.45837151531338, MemAllocated=33.29GB, MaxMemAllocated=52.44GB
186
+ [2024-10-23 10:48:21,406] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
187
+ [2024-10-23 10:48:24,761] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
188
+ [2024-10-23 10:48:45,180] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
189
+ [2024-10-23 10:48:48,458] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
190
+ [2024-10-23 10:50:43,651] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
191
+ [2024-10-23 10:50:46,969] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
192
+ [2024-10-23 10:51:07,962] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[1.3927749088052218e-06, 1.3927749088052218e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
193
+ [2024-10-23 10:51:07,962] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=17.746439174563342, CurrSamplesPerSec=17.855702558049163, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
194
+ [2024-10-23 10:51:11,460] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
195
+ [2024-10-23 10:51:11,461] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=18.529656252919512, CurrSamplesPerSec=18.696832537132607, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
196
+ Saving checkpoint at step 180 ...
197
+ Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
198
+ Saving 16-bit model...
199
+ [2024-10-23 10:51:23,210] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step90 is about to be saved!
200
+ [2024-10-23 10:51:23,211] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin, tag: global_step90
201
+ [2024-10-23 10:51:23,211] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin...
202
+ [2024-10-23 10:51:42,948] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin.
203
+ [2024-10-23 10:51:42,950] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step90 is ready now!
204
+ Model saved!
205
+ Saving 16-bit model...
206
+ [2024-10-23 10:51:51,561] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step90 is about to be saved!
207
+ [2024-10-23 10:51:51,562] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin, tag: global_step90
208
+ [2024-10-23 10:51:51,563] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin...
209
+ [2024-10-23 10:52:13,036] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin.
210
+ [2024-10-23 10:52:13,038] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step90 is ready now!
211
+ Model saved!
212
+ Model saved!
213
+ Checkpoint saved.
214
+ [2024-10-23 10:53:45,229] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
215
+ [2024-10-23 10:53:48,631] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
216
+ [2024-10-23 10:56:08,616] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[5.924074268766422e-07, 5.924074268766422e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
217
+ [2024-10-23 10:56:08,617] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=17.839255597575846, CurrSamplesPerSec=18.94653826925029, MemAllocated=33.12GB, MaxMemAllocated=52.44GB
218
+ [2024-10-23 10:56:11,882] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
219
+ [2024-10-23 10:56:11,883] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=18.63801951889686, CurrSamplesPerSec=19.80692964117057, MemAllocated=33.12GB, MaxMemAllocated=52.44GB
220
+ Saving checkpoint at step 210 ...
221
+ Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
222
+ Saving 16-bit model...
223
+ [2024-10-23 10:58:21,816] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step105 is about to be saved!
224
+ [2024-10-23 10:58:21,818] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin, tag: global_step105
225
+ [2024-10-23 10:58:21,818] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin...
226
+ [2024-10-23 10:58:38,899] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin.
227
+ [2024-10-23 10:58:38,901] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step105 is ready now!
228
+ Model saved!
229
+ Saving 16-bit model...
230
+ [2024-10-23 10:58:47,269] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step105 is about to be saved!
231
+ [2024-10-23 10:58:47,270] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin, tag: global_step105
232
+ [2024-10-23 10:58:47,270] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin...
233
+ [2024-10-23 10:59:09,672] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin.
234
+ [2024-10-23 10:59:09,674] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step105 is ready now!
235
+ Model saved!
236
+ Model saved!
237
+ Checkpoint saved.
238
+ [2024-10-23 11:00:41,653] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
239
+ [2024-10-23 11:00:44,969] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
240
+ [2024-10-23 11:01:05,305] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[1.1893092270227724e-07, 1.1893092270227724e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
241
+ [2024-10-23 11:01:05,306] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=17.926464030034182, CurrSamplesPerSec=18.82486069382099, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
242
+ [2024-10-23 11:01:08,570] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
243
+ [2024-10-23 11:01:08,571] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=18.731570052517945, CurrSamplesPerSec=19.83092777116667, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
244
+ [2024-10-23 11:01:53,444] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
245
+ [2024-10-23 11:01:56,743] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
246
+ Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
247
+ Saving 16-bit model...
248
+ [2024-10-23 11:04:33,468] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step118 is about to be saved!
249
+ [2024-10-23 11:04:33,469] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin, tag: global_step118
250
+ [2024-10-23 11:04:33,469] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin...
251
+ [2024-10-23 11:04:53,660] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin.
252
+ [2024-10-23 11:04:53,662] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step118 is ready now!
253
+ Model saved!
254
+ Saving 16-bit model...
255
+ [2024-10-23 11:05:03,419] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step118 is about to be saved!
256
+ [2024-10-23 11:05:03,420] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin, tag: global_step118
257
+ [2024-10-23 11:05:03,420] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin...
258
+ [2024-10-23 11:05:25,565] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin.
259
+ [2024-10-23 11:05:25,566] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step118 is ready now!
260
+ Model saved!
261
+ Model saved!
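End of the training log for this run: checkpoints were written every 30 training steps (save_interval 30.0 in the config dump below), with the filename suffix carrying the step counter (pytorch_model_60.bin, _90, _120, _150, _180, _210) and the DeepSpeed tag carrying the global step, half of the suffix, consistent with gradient_accumulation_steps: 2; the final untagged pytorch_model.bin holds the global_step118 weights. A small sketch for inspecting one of these 16-bit state dicts on CPU, outside DeepSpeed (path taken from the log above):

import torch

state_dict = torch.load(
    "../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin",  # produced by save_16bit_model as logged
    map_location="cpu",
)
print(len(state_dict), "tensors")
first_name, first_tensor = next(iter(state_dict.items()))
print(first_name, tuple(first_tensor.shape), first_tensor.dtype)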
wandb/run-20241023_090557-paei5sn7/files/requirements.txt ADDED
@@ -0,0 +1,230 @@
1
+ align-anything==0.0.1.dev0
2
+ nvidia-cusolver-cu12==11.4.5.107
3
+ nvidia-curand-cu12==10.3.2.106
4
+ wcwidth==0.2.13
5
+ nvidia-cuda-nvrtc-cu12==12.1.105
6
+ pycparser==2.22
7
+ tokenizers==0.19.1
8
+ grpcio==1.66.2
9
+ joblib==1.4.2
10
+ virtualenv==20.26.6
11
+ pyzmq==26.2.0
12
+ cffi==1.17.1
13
+ sentencepiece==0.2.0
14
+ nvidia-nvtx-cu12==12.1.105
15
+ docker-pycreds==0.4.0
16
+ outlines==0.1.1.dev4+ga2fd35c
17
+ nvidia-nvjitlink-cu12==12.6.77
18
+ annotated-types==0.7.0
19
+ certifi==2024.8.30
20
+ interegular==0.3.3
21
+ Jinja2==3.1.4
22
+ Brotli==1.1.0
23
+ fairscale==0.4.13
24
+ gradio_client==1.4.0
25
+ opencv-python==4.6.0.66
26
+ pyarrow==17.0.0
27
+ absl-py==2.1.0
28
+ lm-format-enforcer==0.10.6
29
+ pydantic_core==2.23.4
30
+ llvmlite==0.43.0
31
+ accelerate==1.0.1
32
+ pytest-split==0.8.0
33
+ nvidia-cuda-cupti-cu12==12.1.105
34
+ watchfiles==0.24.0
35
+ optree==0.13.0
36
+ py-cpuinfo==9.0.0
37
+ scikit-learn==1.5.2
38
+ ftfy==6.3.0
39
+ fastapi==0.115.0
40
+ psutil==6.0.0
41
+ MarkupSafe==2.1.5
42
+ nvidia-cublas-cu12==12.1.3.1
43
+ pip==24.2
44
+ websockets==12.0
45
+ tomlkit==0.12.0
46
+ torchaudio==2.4.0
47
+ huggingface-hub==0.25.2
48
+ mistral_common==1.4.4
49
+ image-reward==1.5
50
+ pyparsing==3.1.4
51
+ aiohappyeyeballs==2.4.3
52
+ click==8.1.7
53
+ httptools==0.6.1
54
+ decorator==4.4.2
55
+ tqdm==4.66.5
56
+ fonttools==4.54.1
57
+ kiwisolver==1.4.7
58
+ ruff==0.6.9
59
+ openai==1.51.2
60
+ partial-json-parser==0.2.1.1.post4
61
+ xformers==0.0.27.post2
62
+ distlib==0.3.9
63
+ GitPython==3.1.43
64
+ pytest==7.2.0
65
+ imageio==2.35.1
66
+ msgspec==0.18.6
67
+ proglog==0.1.10
68
+ yarl==1.15.0
69
+ markdown-it-py==3.0.0
70
+ PyYAML==6.0.2
71
+ xxhash==3.5.0
72
+ braceexpand==0.1.7
73
+ datasets==3.0.1
74
+ mpmath==1.3.0
75
+ distro==1.9.0
76
+ term-image==0.7.2
77
+ python-dotenv==1.0.1
78
+ semantic-version==2.10.0
79
+ multidict==6.1.0
80
+ vllm==0.6.2
81
+ sentry-sdk==2.16.0
82
+ idna==3.10
83
+ starlette==0.38.6
84
+ args==0.1.0
85
+ peft==0.13.2
86
+ librosa==0.10.2.post1
87
+ urllib3==2.2.3
88
+ python-dateutil==2.9.0.post0
89
+ pycountry==24.6.1
90
+ six==1.16.0
91
+ ffmpy==0.4.0
92
+ multiprocess==0.70.16
93
+ cycler==0.12.1
94
+ charset-normalizer==3.4.0
95
+ aiofiles==23.2.1
96
+ shellingham==1.5.4
97
+ propcache==0.2.0
98
+ lark==1.2.2
99
+ torch==2.4.0
100
+ Werkzeug==3.0.4
101
+ nvidia-cusparse-cu12==12.1.0.106
102
+ clip==0.2.0
103
+ hjson==3.1.0
104
+ diffusers==0.30.3
105
+ attrs==24.2.0
106
+ lazy_loader==0.4
107
+ numpy==1.26.4
108
+ rpds-py==0.20.0
109
+ pytz==2024.2
110
+ audioread==3.0.1
111
+ platformdirs==4.3.6
112
+ deepspeed==0.15.2
113
+ gguf==0.10.0
114
+ wandb==0.18.3
115
+ prometheus_client==0.21.0
116
+ gitdb==4.0.11
117
+ packaging==24.1
118
+ sympy==1.13.3
119
+ mutagen==1.47.0
120
+ contourpy==1.3.0
121
+ pluggy==1.5.0
122
+ python-multipart==0.0.12
123
+ soundfile==0.12.1
124
+ typer==0.12.5
125
+ timm==0.6.13
126
+ frozenlist==1.4.1
127
+ httpx==0.27.2
128
+ mmsg==0.1.dev20+g585c63a.d20241012
129
+ tiktoken==0.7.0
130
+ pydub==0.25.1
131
+ diskcache==5.6.3
132
+ einops==0.8.0
133
+ setproctitle==1.3.3
134
+ scipy==1.14.1
135
+ typing_extensions==4.12.2
136
+ httpcore==1.0.6
137
+ cfgv==3.4.0
138
+ requests==2.32.3
139
+ torchlibrosa==0.1.0
140
+ pydantic==2.9.2
141
+ torchvision==0.19.0
142
+ sniffio==1.3.1
143
+ pyairports==2.1.1
144
+ hpsv2==1.2.0
145
+ protobuf==3.20.3
146
+ wheel==0.44.0
147
+ smmap==5.0.1
148
+ zipp==3.20.2
149
+ iniconfig==2.0.0
150
+ airportsdata==20241001
151
+ clint==0.5.1
152
+ pooch==1.8.2
153
+ shortuuid==1.0.13
154
+ pycryptodomex==3.21.0
155
+ cloudpickle==3.1.0
156
+ transformers==4.44.0.dev0
157
+ regex==2024.9.11
158
+ numba==0.60.0
159
+ tzdata==2024.2
160
+ orjson==3.10.7
161
+ jsonschema-specifications==2024.10.1
162
+ safetensors==0.4.5
163
+ outlines_core==0.1.0
164
+ filelock==3.16.1
165
+ threadpoolctl==3.5.0
166
+ soxr==0.5.0.post1
167
+ nvidia-cufft-cu12==11.0.2.54
168
+ networkx==3.4.1
169
+ msgpack==1.1.0
170
+ pandas==2.2.3
171
+ align-anything==0.0.1.dev0
172
+ anyio==4.6.0
173
+ nvidia-cuda-runtime-cu12==12.1.105
174
+ bitsandbytes==0.44.1
175
+ aiohttp==3.10.10
176
+ matplotlib==3.9.2
177
+ triton==3.0.0
178
+ tensorboard==2.18.0
179
+ nodeenv==1.9.1
180
+ fsspec==2024.6.1
181
+ webdataset==0.2.100
182
+ imageio-ffmpeg==0.5.1
183
+ mdurl==0.1.2
184
+ identify==2.6.1
185
+ h11==0.14.0
186
+ uvloop==0.20.0
187
+ rich==13.9.2
188
+ frechet-audio-distance==0.1.2
189
+ uvicorn==0.31.1
190
+ pytorch-fid==0.3.0
191
+ yt-dlp==2024.8.6
192
+ jiter==0.6.1
193
+ nest-asyncio==1.6.0
194
+ pre_commit==4.0.1
195
+ referencing==0.35.1
196
+ resampy==0.4.3
197
+ tensorboard-data-server==0.7.2
198
+ importlib_metadata==8.5.0
199
+ aiosignal==1.3.1
200
+ dill==0.3.8
201
+ prometheus-fastapi-instrumentator==7.0.0
202
+ ninja==1.11.1.1
203
+ nvidia-ml-py==12.560.30
204
+ moviepy==1.0.3
205
+ nvidia-cudnn-cu12==9.1.0.70
206
+ Markdown==3.7
207
+ ray==2.37.0
208
+ gradio==5.0.2
209
+ jsonschema==4.23.0
210
+ Pygments==2.18.0
211
+ nvidia-nccl-cu12==2.20.5
212
+ pillow==10.4.0
213
+ setuptools==75.1.0
214
+ jaraco.text==3.12.1
215
+ inflect==7.3.1
216
+ jaraco.collections==5.1.0
217
+ autocommand==2.2.2
218
+ tomli==2.0.1
219
+ jaraco.context==5.3.0
220
+ jaraco.functools==4.0.1
221
+ importlib_resources==6.4.0
222
+ wheel==0.43.0
223
+ packaging==24.1
224
+ backports.tarfile==1.2.0
225
+ importlib_metadata==8.0.0
226
+ typing_extensions==4.12.2
227
+ zipp==3.19.2
228
+ typeguard==4.3.0
229
+ more-itertools==10.3.0
230
+ platformdirs==4.2.2
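The requirements dump above is effectively a pip freeze of the run environment; the pins that matter most for reproducing this PPO run are the training stack (torch 2.4.0, deepspeed 0.15.2, transformers 4.44.0.dev0, accelerate 1.0.1, wandb 0.18.3). A small sketch for checking a local environment against those pins (the package selection here is my own, not part of the dump):

from importlib.metadata import PackageNotFoundError, version

pinned = {"torch": "2.4.0", "deepspeed": "0.15.2", "transformers": "4.44.0.dev0", "wandb": "0.18.3"}
for pkg, want in pinned.items():
    try:
        have = version(pkg)
    except PackageNotFoundError:
        have = "not installed"
    print(f"{pkg}: pinned {want}, installed {have}")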
wandb/run-20241023_090557-paei5sn7/files/wandb-metadata.json ADDED
@@ -0,0 +1,102 @@
1
+ {
2
+ "os": "Linux-5.4.0-196-generic-x86_64-with-glibc2.31",
3
+ "python": "3.11.10",
4
+ "startedAt": "2024-10-23T09:05:57.415346Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--actor_model_name_or_path",
8
+ "/data/align-anything/hantao/models/0916_ti_to_ti_sft",
9
+ "--reward_model_name_or_path",
10
+ "/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800",
11
+ "--reward_critic_model_name_or_path",
12
+ "/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800",
13
+ "--train_datasets",
14
+ "/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs",
15
+ "--train_template",
16
+ "spavl_ti2ti",
17
+ "--train_data_files",
18
+ "ti2ti_llf_prompt_only_tokenize.pt",
19
+ "--output_dir",
20
+ "../outputs/ppo_ti2ti_llf_1023_step_800",
21
+ "--save_interval",
22
+ "30"
23
+ ],
24
+ "program": "-m align_anything.trainers.text_image_to_text_image.ppo",
25
+ "git": {
26
+ "remote": "https://github.com/PKU-Alignment/align-anything.git",
27
+ "commit": "6fde660afc9985323f147930eedf188a5699adc7"
28
+ },
29
+ "email": "[email protected]",
30
+ "root": "../outputs/ppo_ti2ti_llf_1023_step_800",
31
+ "host": "lyg0196",
32
+ "username": "align-anything",
33
+ "executable": "/home/align-anything/miniconda3/envs/hantao_cham/bin/python",
34
+ "cpu_count": 64,
35
+ "cpu_count_logical": 128,
36
+ "gpu": "[NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB]",
37
+ "gpu_count": 8,
38
+ "disk": {
39
+ "/": {
40
+ "total": "940744544256",
41
+ "used": "297219252224"
42
+ }
43
+ },
44
+ "memory": {
45
+ "total": "540647583744"
46
+ },
47
+ "cpu": {
48
+ "count": 64,
49
+ "countLogical": 128
50
+ },
51
+ "gpu_nvidia": [
52
+ {
53
+ "name": "NVIDIA A100-SXM4-80GB",
54
+ "memoryTotal": "85899345920",
55
+ "cudaCores": 6912,
56
+ "architecture": "Ampere"
57
+ },
58
+ {
59
+ "name": "NVIDIA A100-SXM4-80GB",
60
+ "memoryTotal": "85899345920",
61
+ "cudaCores": 6912,
62
+ "architecture": "Ampere"
63
+ },
64
+ {
65
+ "name": "NVIDIA A100-SXM4-80GB",
66
+ "memoryTotal": "85899345920",
67
+ "cudaCores": 6912,
68
+ "architecture": "Ampere"
69
+ },
70
+ {
71
+ "name": "NVIDIA A100-SXM4-80GB",
72
+ "memoryTotal": "85899345920",
73
+ "cudaCores": 6912,
74
+ "architecture": "Ampere"
75
+ },
76
+ {
77
+ "name": "NVIDIA A100-SXM4-80GB",
78
+ "memoryTotal": "85899345920",
79
+ "cudaCores": 6912,
80
+ "architecture": "Ampere"
81
+ },
82
+ {
83
+ "name": "NVIDIA A100-SXM4-80GB",
84
+ "memoryTotal": "85899345920",
85
+ "cudaCores": 6912,
86
+ "architecture": "Ampere"
87
+ },
88
+ {
89
+ "name": "NVIDIA A100-SXM4-80GB",
90
+ "memoryTotal": "85899345920",
91
+ "cudaCores": 6912,
92
+ "architecture": "Ampere"
93
+ },
94
+ {
95
+ "name": "NVIDIA A100-SXM4-80GB",
96
+ "memoryTotal": "85899345920",
97
+ "cudaCores": 6912,
98
+ "architecture": "Ampere"
99
+ }
100
+ ],
101
+ "cudaVersion": "12.4"
102
+ }
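The metadata above records exactly how this rank was invoked (the "program" module plus "args"); the --local_rank=0 argument indicates the top-level launcher was presumably deepspeed or torchrun. A small sketch that rebuilds the per-rank command from this file, assuming it is read from its path inside the uploaded run directory:

import json
import shlex

with open("wandb/run-20241023_090557-paei5sn7/files/wandb-metadata.json") as f:
    meta = json.load(f)

cmd = "python " + meta["program"] + " " + " ".join(shlex.quote(a) for a in meta["args"])
print(cmd)  # python -m align_anything.trainers.text_image_to_text_image.ppo --local_rank=0 --actor_model_name_or_path ...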
wandb/run-20241023_090557-paei5sn7/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"train/reward_critic_loss":0.40057873725891113,"train/reward_advantage":0.3540787100791931,"train/mean_generated_length":1,"_runtime":7168.249961391,"train/reward_value":-0.5498046875,"train/reward_critic_lr":5e-06,"train/kl_divergence":4.317548751831055,"_wandb":{"runtime":7168},"train/reward_with_kl_penalty":-0.1957259625196457,"train/max_generated_length":1,"train/actor_loss":-0.3540787100791931,"train/reward":-0.109375,"_step":236,"train/actor_lr":0,"train/reward_return":-0.1957259625196457,"train/step":236,"_timestamp":1.7296814635198205e+09}
wandb/run-20241023_090557-paei5sn7/logs/debug-internal.log ADDED
@@ -0,0 +1,20 @@
1
+ {"time":"2024-10-23T09:05:57.421439083Z","level":"INFO","msg":"using version","core version":"0.18.3"}
2
+ {"time":"2024-10-23T09:05:57.421466903Z","level":"INFO","msg":"created symlink","path":"../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug-core.log"}
3
+ {"time":"2024-10-23T09:05:57.425862224Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
4
+ {"time":"2024-10-23T09:05:57.446650446Z","level":"INFO","msg":"created new stream","id":"paei5sn7"}
5
+ {"time":"2024-10-23T09:05:57.44668639Z","level":"INFO","msg":"stream: started","id":"paei5sn7"}
6
+ {"time":"2024-10-23T09:05:57.446720716Z","level":"INFO","msg":"handler: started","stream_id":{"value":"paei5sn7"}}
7
+ {"time":"2024-10-23T09:05:57.446711826Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"paei5sn7"}}
8
+ {"time":"2024-10-23T09:05:57.446734515Z","level":"INFO","msg":"sender: started","stream_id":{"value":"paei5sn7"}}
9
+ {"time":"2024-10-23T09:05:58.065915529Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
10
+ {"time":"2024-10-23T09:05:58.068726401Z","level":"INFO","msg":"Starting system monitor"}
11
+ {"time":"2024-10-23T11:05:25.66533688Z","level":"INFO","msg":"Stopping system monitor"}
12
+ {"time":"2024-10-23T11:05:25.691953146Z","level":"INFO","msg":"Stopped system monitor"}
13
+ {"time":"2024-10-23T11:05:26.186981401Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
14
+ {"time":"2024-10-23T11:05:26.187015556Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
15
+ {"time":"2024-10-23T11:05:27.289810318Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
16
+ {"time":"2024-10-23T11:05:28.84134155Z","level":"INFO","msg":"stream: closing","id":"paei5sn7"}
17
+ {"time":"2024-10-23T11:05:28.841377348Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"paei5sn7"}}
18
+ {"time":"2024-10-23T11:05:28.841437021Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"paei5sn7"}}
19
+ {"time":"2024-10-23T11:05:28.841525942Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"paei5sn7"}}
20
+ {"time":"2024-10-23T11:05:28.842923992Z","level":"INFO","msg":"stream: closed","id":"paei5sn7"}
wandb/run-20241023_090557-paei5sn7/logs/debug.log ADDED
@@ -0,0 +1,33 @@
1
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Configure stats pid to 1071579
3
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
4
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
5
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
7
+ 2024-10-23 09:05:57,409 WARNING MainThread:1071579 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.ppo
8
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.ppo'}
9
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:_log_setup():532] Logging user logs to ../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug.log
11
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:_log_setup():533] Logging internal logs to ../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug-internal.log
12
+ 2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:init():617] calling init triggers
13
+ 2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
14
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_prompt_batch_size': 8, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 2, 'actor_gradient_checkpointing': True, 'critic_gradient_checkpointing': True, 'actor_lr': 1e-05, 'actor_lr_scheduler_type': 'cosine', 'actor_lr_warmup_ratio': 0.03, 'actor_weight_decay': 0.01, 'critic_lr': 5e-06, 'critic_lr_scheduler_type': 'constant', 'critic_lr_warmup_ratio': 0.03, 'critic_weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'kl_coeff': 0.02, 'clip_range_ratio': 0.2, 'clip_range_score': 50.0, 'clip_range_value': 5.0, 'ptx_coeff': 16.0, 'gamma': 1.0, 'gae_lambda': 0.95, 'normalize_reward': False, 'update_iters': 1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs', 'train_template': 'spavl_ti2ti', 'train_size': 5000, 'train_split': None, 'train_subset': None, 'train_data_files': 'ti2ti_llf_prompt_only_tokenize.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': [], 'ptx_datasets': None, 'ptx_template': None, 'ptx_size': None, 'ptx_subset': None, 'ptx_split': None, 'ptx_data_files': None, 'ptx_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'ppo', 'output_dir': '../outputs/ppo_ti2ti_llf_1023_step_800', 'cache_dir': None, 'save_interval': 30.0}, 'model_cfgs': {'actor_model_name_or_path': '/data/align-anything/hantao/models/0916_ti_to_ti_sft', 'reward_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800', 'reward_critic_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800', 'trust_remote_code': True, 'model_max_length': 2048, 'temperature': 1.0, 'top_p': 1.0, 'repetition_penalty': 1.0}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': None}
15
+ 2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():667] starting backend
16
+ 2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():671] sending inform_init request
17
+ 2024-10-23 09:05:57,414 INFO MainThread:1071579 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
18
+ 2024-10-23 09:05:57,415 INFO MainThread:1071579 [wandb_init.py:init():684] backend started and connected
19
+ 2024-10-23 09:05:57,418 INFO MainThread:1071579 [wandb_init.py:init():779] updated telemetry
20
+ 2024-10-23 09:05:57,429 INFO MainThread:1071579 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
21
+ 2024-10-23 09:05:58,062 INFO MainThread:1071579 [wandb_init.py:init():863] starting run threads in backend
22
+ 2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_console_start():2465] atexit reg
23
+ 2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2313] redirect: wrap_raw
24
+ 2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2378] Wrapping output streams.
25
+ 2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2403] Redirects installed.
26
+ 2024-10-23 09:05:58,197 INFO MainThread:1071579 [wandb_init.py:init():907] run started, returning control to user process
27
+ 2024-10-23 11:05:25,658 INFO MainThread:1071579 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/paei5sn7
28
+ 2024-10-23 11:05:25,662 INFO MainThread:1071579 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
29
+ 2024-10-23 11:05:25,664 INFO MainThread:1071579 [wandb_run.py:_restore():2410] restore
30
+ 2024-10-23 11:05:25,664 INFO MainThread:1071579 [wandb_run.py:_restore():2416] restore done
31
+ 2024-10-23 11:05:28,807 INFO MainThread:1071579 [wandb_run.py:_footer_history_summary_info():4049] rendering history
32
+ 2024-10-23 11:05:28,809 INFO MainThread:1071579 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
33
+ 2024-10-23 11:05:28,839 INFO MainThread:1071579 [wandb_run.py:_footer_sync_info():4008] logging synced files
wandb/run-20241023_090557-paei5sn7/run-paei5sn7.wandb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7875273453296c40e06ab7dc88ef1748a4c98f52eba4903dcc2530faa8b6a23d
3
+ size 6283361