Upload folder using huggingface_hub
- .gitattributes +1 -0
- arguments.yaml +94 -0
- config.json +0 -0
- environ.txt +62 -0
- preprocessor_config.json +28 -0
- processor_config.json +5 -0
- pytorch_model.bin +3 -0
- script.sh +48 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +0 -0
- wandb/debug-internal.log +20 -0
- wandb/debug.log +33 -0
- wandb/run-20241023_090557-paei5sn7/files/config.yaml +143 -0
- wandb/run-20241023_090557-paei5sn7/files/output.log +261 -0
- wandb/run-20241023_090557-paei5sn7/files/requirements.txt +230 -0
- wandb/run-20241023_090557-paei5sn7/files/wandb-metadata.json +102 -0
- wandb/run-20241023_090557-paei5sn7/files/wandb-summary.json +1 -0
- wandb/run-20241023_090557-paei5sn7/logs/debug-internal.log +20 -0
- wandb/run-20241023_090557-paei5sn7/logs/debug.log +33 -0
- wandb/run-20241023_090557-paei5sn7/run-paei5sn7.wandb +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241023_090557-paei5sn7/run-paei5sn7.wandb filter=lfs diff=lfs merge=lfs -text
arguments.yaml
ADDED
@@ -0,0 +1,94 @@
bnb_cfgs:
  bnb_4bit_compute_dtype: float16
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: true
  load_in_4bit: true
  load_in_8bit: false
  use_bnb: false
data_cfgs:
  eval_data_files: null
  eval_datasets: null
  eval_optional_args: []
  eval_size: null
  eval_split: null
  eval_subset: null
  eval_template: null
  ptx_data_files: null
  ptx_datasets: null
  ptx_optional_args: []
  ptx_size: null
  ptx_split: null
  ptx_subset: null
  ptx_template: null
  train_data_files: ti2ti_llf_prompt_only_tokenize.pt
  train_datasets: /data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs
  train_optional_args: []
  train_size: 5000
  train_split: null
  train_subset: null
  train_template: spavl_ti2ti
logger_cfgs:
  cache_dir: null
  log_project: align-anything
  log_run_name: ppo
  log_type: wandb
  output_dir: ../outputs/ppo_ti2ti_llf_1023_step_800
  save_interval: 30.0
lora_cfgs:
  inference_mode: false
  lora_alpha: 16
  lora_dropout: 0.1
  r: 16
  save_full_model: true
  target_modules:
  - q_proj
  - v_proj
  task_type: TaskType.CAUSAL_LM
  use_lora: false
model_cfgs:
  actor_model_name_or_path: /data/align-anything/hantao/models/0916_ti_to_ti_sft
  model_max_length: 2048
  repetition_penalty: 1.0
  reward_critic_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800
  reward_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800
  temperature: 1.0
  top_p: 1.0
  trust_remote_code: true
special_tokens: null
train_cfgs:
  actor_gradient_checkpointing: true
  actor_lr: 1.0e-05
  actor_lr_scheduler_type: cosine
  actor_lr_warmup_ratio: 0.03
  actor_weight_decay: 0.01
  adam_betas:
  - 0.9
  - 0.95
  bf16: true
  clip_range_ratio: 0.2
  clip_range_score: 50.0
  clip_range_value: 5.0
  critic_gradient_checkpointing: true
  critic_lr: 5.0e-06
  critic_lr_scheduler_type: constant
  critic_lr_warmup_ratio: 0.03
  critic_weight_decay: 0.0
  ds_cfgs: ds_z3_config.json
  epochs: 3
  eval_interval: 10
  eval_strategy: epoch
  fp16: false
  freeze_language_model: true
  freeze_mm_proj: true
  freeze_vision_tower: false
  gae_lambda: 0.95
  gamma: 1.0
  gradient_accumulation_steps: 2
  kl_coeff: 0.02
  normalize_reward: false
  per_device_eval_batch_size: 8
  per_device_prompt_batch_size: 8
  per_device_train_batch_size: 8
  ptx_coeff: 16.0
  seed: 42
  update_iters: 1
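For orientation, a minimal sketch of the effective batch sizes these train_cfgs imply, assuming the 8-process launch recorded in environ.txt below (WORLD_SIZE=8 is not part of this YAML):

# Sketch only: effective batch sizes implied by train_cfgs above.
world_size = 8                   # assumption, taken from environ.txt (WORLD_SIZE=8)
per_device_prompt = 8            # per_device_prompt_batch_size
per_device_train = 8             # per_device_train_batch_size
grad_accum = 2                   # gradient_accumulation_steps

prompts_per_rollout = per_device_prompt * world_size                      # 64 prompts per PPO rollout
samples_per_optimizer_step = per_device_train * grad_accum * world_size   # 128 samples per update
print(prompts_per_rollout, samples_per_optimizer_step)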
config.json
ADDED
The diff for this file is too large to render.
environ.txt
ADDED
@@ -0,0 +1,62 @@
CONDA_DEFAULT_ENV=hantao_cham
CONDA_EXE=/home/align-anything/miniconda3/bin/conda
CONDA_PREFIX=/home/align-anything/miniconda3/envs/hantao_cham
CONDA_PREFIX_1=/home/align-anything/miniconda3
CONDA_PREFIX_2=/home/align-anything/miniconda3/envs/hantao_cham
CONDA_PREFIX_3=/home/align-anything/miniconda3/envs/hantao_stable
CONDA_PREFIX_4=/home/align-anything/miniconda3/envs/hantao_cham
CONDA_PREFIX_5=/home/align-anything/miniconda3/envs/hantao_stable
CONDA_PREFIX_6=/home/align-anything/miniconda3/envs/hantao_cham
CONDA_PREFIX_7=/home/align-anything/miniconda3/envs/hantao_stable
CONDA_PROMPT_MODIFIER=(hantao_cham)
CONDA_PYTHON_EXE=/home/align-anything/miniconda3/bin/python
CONDA_SHLVL=8
CRASHDIR=/etc/ShellCrash
CROSS_RANK=0
CROSS_SIZE=1
CUDA_MODULE_LOADING=LAZY
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/2000/bus
HOME=/home/align-anything
LANG=en_US.UTF-8
LD_LIBRARY_PATH=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/../../lib64:
LESSCLOSE=/usr/bin/lesspipe %s %s
LESSOPEN=| /usr/bin/lesspipe %s
LOCAL_RANK=0
LOCAL_SIZE=8
LOGLEVEL=WARNING
LOGNAME=align-anything
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
MASTER_ADDR=127.0.0.1
MASTER_PORT=23139
MOTD_SHOWN=pam
OLDPWD=/data/align-anything/hantao/align-anything/projects/text_image_to_text_image
PATH=/home/align-anything/miniconda3/envs/hantao_cham/bin:/home/align-anything/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
PWD=/data/align-anything/hantao/align-anything/scripts
PYGAME_HIDE_SUPPORT_PROMPT=1
PYTHONHASHSEED=42
PYTHONPATH=/data/align-anything/hantao/align-anything
QT_QPA_FONTDIR=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/qt/fonts
QT_QPA_PLATFORM_PLUGIN_PATH=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/qt/plugins
RANK=0
SHELL=/bin/bash
SHLVL=3
SSH_CLIENT=111.205.230.212 43947 30600
SSH_CONNECTION=111.205.230.212 44215 10.10.212.196 30600
SSH_TTY=/dev/pts/1
TERM=screen
TMUX=/tmp/tmux-2000/default,53635,1
TMUX_PANE=%1
USER=align-anything
WANDB_API_KEY=7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33
WANDB_MODE=online
WANDB_SERVICE=2-1071579-tcp-localhost-45673
WORLD_SIZE=8
XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
XDG_RUNTIME_DIR=/run/user/2000
XDG_SESSION_CLASS=user
XDG_SESSION_ID=15
XDG_SESSION_TYPE=tty
_=/home/align-anything/miniconda3/envs/hantao_cham/bin/deepspeed
_CE_CONDA=
_CE_M=
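A small sketch of the launch topology these variables describe, assuming the usual DeepSpeed/torch.distributed meaning of WORLD_SIZE and LOCAL_SIZE:

# Sketch only: topology implied by the environment above.
world_size, local_size = 8, 8                  # WORLD_SIZE, LOCAL_SIZE
visible_gpus = "0,1,2,3,4,5,6,7".split(",")    # CUDA_VISIBLE_DEVICES
num_nodes = world_size // local_size           # 1 -> a single 8-GPU node
print(num_nodes, len(visible_gpus))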
preprocessor_config.json
ADDED
@@ -0,0 +1,28 @@
{
  "crop_size": {
    "height": 512,
    "width": 512
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    1.0,
    1.0,
    1.0
  ],
  "image_processor_type": "ChameleonImageProcessor",
  "image_std": [
    1.0,
    1.0,
    1.0
  ],
  "processor_class": "ChameleonProcessor",
  "resample": 1,
  "rescale_factor": 0.0078,
  "size": {
    "shortest_edge": 512
  }
}
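As a reading aid, a sketch of the per-pixel arithmetic these fields encode, assuming the usual Hugging Face image-processor order (resize/center-crop, rescale, then normalize):

import numpy as np

# Sketch only: rescale by 0.0078, then normalize with mean=1.0 and std=1.0,
# which maps 8-bit pixel values into roughly [-1, 1].
rescale_factor, mean, std = 0.0078, 1.0, 1.0
pixels = np.array([0.0, 128.0, 255.0])
out = (pixels * rescale_factor - mean) / std   # [-1.0, -0.0016, 0.989]
print(out)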
processor_config.json
ADDED
@@ -0,0 +1,5 @@
{
  "image_seq_length": 1024,
  "image_token": "<image>",
  "processor_class": "ChameleonProcessor"
}
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e463e8c312e3fcebc9ff47999f2efbe72a12805c69e3ba885a8b38b4ebe8d478
size 14165009930
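This is a Git LFS pointer rather than the weights themselves (about 14.2 GB). A sketch of checking a downloaded copy against it; the local filename is an assumption:

import hashlib, os

# Sketch only: verify a downloaded pytorch_model.bin against the LFS pointer above.
expected_sha256 = "e463e8c312e3fcebc9ff47999f2efbe72a12805c69e3ba885a8b38b4ebe8d478"
expected_size = 14165009930
path = "pytorch_model.bin"  # assumed local path

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)

assert os.path.getsize(path) == expected_size
assert h.hexdigest() == expected_sha256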
script.sh
ADDED
@@ -0,0 +1,48 @@
#!/usr/bin/env bash
#
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Initialize variables
# For wandb online logging
export WANDB_API_KEY="7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33"
# Source the setup script
# source ./setup.sh

export WANDB_MODE=online

ACTOR_MODEL_NAME_OR_PATH="/data/align-anything/hantao/models/0916_ti_to_ti_sft"
CRITIC_MODEL_NAME_OR_PATH="/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800"
REWARD_MODEL_NAME_OR_PATH="/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800"
TRAIN_DATASETS="/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs"
PTX_DATASETS="/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs"
OUTPUT_DIR="../outputs/ppo_ti2ti_llf_1023_step_800"

# Source the setup script
source ./setup.sh

# Execute deepspeed command
deepspeed \
    --master_port ${MASTER_PORT} \
    --module align_anything.trainers.text_image_to_text_image.ppo \
    --actor_model_name_or_path ${ACTOR_MODEL_NAME_OR_PATH} \
    --reward_model_name_or_path ${REWARD_MODEL_NAME_OR_PATH} \
    --reward_critic_model_name_or_path ${CRITIC_MODEL_NAME_OR_PATH} \
    --train_datasets ${TRAIN_DATASETS} \
    --train_template spavl_ti2ti \
    --train_data_files ti2ti_llf_prompt_only_tokenize.pt \
    --output_dir ${OUTPUT_DIR} \
    --save_interval 30
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "<reserved08706>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
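A sketch of loading the bundled tokenizer and inspecting these entries; the local path is a placeholder for wherever this repository is checked out:

from transformers import AutoTokenizer

# Sketch only: "." assumes the script runs from a local copy of this folder.
tokenizer = AutoTokenizer.from_pretrained(".")
print(tokenizer.special_tokens_map)  # expect <s>, </s>, <pad>, <reserved08706>, <unk>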
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
The diff for this file is too large to render.
wandb/debug-internal.log
ADDED
@@ -0,0 +1,20 @@
{"time":"2024-10-23T09:05:57.421439083Z","level":"INFO","msg":"using version","core version":"0.18.3"}
{"time":"2024-10-23T09:05:57.421466903Z","level":"INFO","msg":"created symlink","path":"../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug-core.log"}
{"time":"2024-10-23T09:05:57.425862224Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
{"time":"2024-10-23T09:05:57.446650446Z","level":"INFO","msg":"created new stream","id":"paei5sn7"}
{"time":"2024-10-23T09:05:57.44668639Z","level":"INFO","msg":"stream: started","id":"paei5sn7"}
{"time":"2024-10-23T09:05:57.446720716Z","level":"INFO","msg":"handler: started","stream_id":{"value":"paei5sn7"}}
{"time":"2024-10-23T09:05:57.446711826Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"paei5sn7"}}
{"time":"2024-10-23T09:05:57.446734515Z","level":"INFO","msg":"sender: started","stream_id":{"value":"paei5sn7"}}
{"time":"2024-10-23T09:05:58.065915529Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
{"time":"2024-10-23T09:05:58.068726401Z","level":"INFO","msg":"Starting system monitor"}
{"time":"2024-10-23T11:05:25.66533688Z","level":"INFO","msg":"Stopping system monitor"}
{"time":"2024-10-23T11:05:25.691953146Z","level":"INFO","msg":"Stopped system monitor"}
{"time":"2024-10-23T11:05:26.186981401Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
{"time":"2024-10-23T11:05:26.187015556Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
{"time":"2024-10-23T11:05:27.289810318Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
{"time":"2024-10-23T11:05:28.84134155Z","level":"INFO","msg":"stream: closing","id":"paei5sn7"}
{"time":"2024-10-23T11:05:28.841377348Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"paei5sn7"}}
{"time":"2024-10-23T11:05:28.841437021Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"paei5sn7"}}
{"time":"2024-10-23T11:05:28.841525942Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"paei5sn7"}}
{"time":"2024-10-23T11:05:28.842923992Z","level":"INFO","msg":"stream: closed","id":"paei5sn7"}
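These are JSON-lines records. A sketch of recovering the wall-clock span of the run from the first and last entries (about two hours here), dropping the sub-second fraction to keep parsing simple:

import json
from datetime import datetime

# Sketch only: run duration from wandb's internal log.
with open("wandb/debug-internal.log") as f:
    records = [json.loads(line) for line in f if line.strip()]

def parse(ts):
    # Timestamps carry nanoseconds; strip everything after the seconds field.
    return datetime.strptime(ts.split(".")[0], "%Y-%m-%dT%H:%M:%S")

print(parse(records[-1]["time"]) - parse(records[0]["time"]))  # ~1:59:31 for this run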
wandb/debug.log
ADDED
@@ -0,0 +1,33 @@
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Configure stats pid to 1071579
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
2024-10-23 09:05:57,409 WARNING MainThread:1071579 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.ppo
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.ppo'}
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Applying login settings: {}
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:_log_setup():532] Logging user logs to ../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug.log
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:_log_setup():533] Logging internal logs to ../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug-internal.log
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:init():617] calling init triggers
2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_prompt_batch_size': 8, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 2, 'actor_gradient_checkpointing': True, 'critic_gradient_checkpointing': True, 'actor_lr': 1e-05, 'actor_lr_scheduler_type': 'cosine', 'actor_lr_warmup_ratio': 0.03, 'actor_weight_decay': 0.01, 'critic_lr': 5e-06, 'critic_lr_scheduler_type': 'constant', 'critic_lr_warmup_ratio': 0.03, 'critic_weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'kl_coeff': 0.02, 'clip_range_ratio': 0.2, 'clip_range_score': 50.0, 'clip_range_value': 5.0, 'ptx_coeff': 16.0, 'gamma': 1.0, 'gae_lambda': 0.95, 'normalize_reward': False, 'update_iters': 1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs', 'train_template': 'spavl_ti2ti', 'train_size': 5000, 'train_split': None, 'train_subset': None, 'train_data_files': 'ti2ti_llf_prompt_only_tokenize.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': [], 'ptx_datasets': None, 'ptx_template': None, 'ptx_size': None, 'ptx_subset': None, 'ptx_split': None, 'ptx_data_files': None, 'ptx_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'ppo', 'output_dir': '../outputs/ppo_ti2ti_llf_1023_step_800', 'cache_dir': None, 'save_interval': 30.0}, 'model_cfgs': {'actor_model_name_or_path': '/data/align-anything/hantao/models/0916_ti_to_ti_sft', 'reward_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800', 'reward_critic_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800', 'trust_remote_code': True, 'model_max_length': 2048, 'temperature': 1.0, 'top_p': 1.0, 'repetition_penalty': 1.0}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': None}
2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():667] starting backend
2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():671] sending inform_init request
2024-10-23 09:05:57,414 INFO MainThread:1071579 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2024-10-23 09:05:57,415 INFO MainThread:1071579 [wandb_init.py:init():684] backend started and connected
2024-10-23 09:05:57,418 INFO MainThread:1071579 [wandb_init.py:init():779] updated telemetry
2024-10-23 09:05:57,429 INFO MainThread:1071579 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
2024-10-23 09:05:58,062 INFO MainThread:1071579 [wandb_init.py:init():863] starting run threads in backend
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_console_start():2465] atexit reg
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2313] redirect: wrap_raw
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2378] Wrapping output streams.
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2403] Redirects installed.
2024-10-23 09:05:58,197 INFO MainThread:1071579 [wandb_init.py:init():907] run started, returning control to user process
2024-10-23 11:05:25,658 INFO MainThread:1071579 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/paei5sn7
2024-10-23 11:05:25,662 INFO MainThread:1071579 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
2024-10-23 11:05:25,664 INFO MainThread:1071579 [wandb_run.py:_restore():2410] restore
2024-10-23 11:05:25,664 INFO MainThread:1071579 [wandb_run.py:_restore():2416] restore done
2024-10-23 11:05:28,807 INFO MainThread:1071579 [wandb_run.py:_footer_history_summary_info():4049] rendering history
2024-10-23 11:05:28,809 INFO MainThread:1071579 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
2024-10-23 11:05:28,839 INFO MainThread:1071579 [wandb_run.py:_footer_sync_info():4008] logging synced files
wandb/run-20241023_090557-paei5sn7/files/config.yaml
ADDED
@@ -0,0 +1,143 @@
_wandb:
  value:
    cli_version: 0.18.3
    m: []
    python_version: 3.11.10
    t:
      "1":
      - 1
      - 11
      - 41
      - 49
      - 51
      - 55
      - 71
      - 83
      - 98
      - 105
      "2":
      - 1
      - 11
      - 41
      - 49
      - 51
      - 55
      - 71
      - 83
      - 98
      - 105
      "3":
      - 2
      - 13
      - 16
      - 23
      - 55
      - 61
      "4": 3.11.10
      "5": 0.18.3
      "6": 4.44.0.dev0
      "8":
      - 5
      "12": 0.18.3
      "13": linux-x86_64
bnb_cfgs:
  value:
    bnb_4bit_compute_dtype: float16
    bnb_4bit_quant_type: nf4
    bnb_4bit_use_double_quant: true
    load_in_4bit: true
    load_in_8bit: false
    use_bnb: false
data_cfgs:
  value:
    eval_data_files: null
    eval_datasets: null
    eval_optional_args: []
    eval_size: null
    eval_split: null
    eval_subset: null
    eval_template: null
    ptx_data_files: null
    ptx_datasets: null
    ptx_optional_args: []
    ptx_size: null
    ptx_split: null
    ptx_subset: null
    ptx_template: null
    train_data_files: ti2ti_llf_prompt_only_tokenize.pt
    train_datasets: /data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs
    train_optional_args: []
    train_size: 5000
    train_split: null
    train_subset: null
    train_template: spavl_ti2ti
logger_cfgs:
  value:
    cache_dir: null
    log_project: align-anything
    log_run_name: ppo
    log_type: wandb
    output_dir: ../outputs/ppo_ti2ti_llf_1023_step_800
    save_interval: 30
lora_cfgs:
  value:
    inference_mode: false
    lora_alpha: 16
    lora_dropout: 0.1
    r: 16
    save_full_model: true
    target_modules:
    - q_proj
    - v_proj
    task_type: TaskType.CAUSAL_LM
    use_lora: false
model_cfgs:
  value:
    actor_model_name_or_path: /data/align-anything/hantao/models/0916_ti_to_ti_sft
    model_max_length: 2048
    repetition_penalty: 1
    reward_critic_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800
    reward_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800
    temperature: 1
    top_p: 1
    trust_remote_code: true
special_tokens:
  value: null
train_cfgs:
  value:
    actor_gradient_checkpointing: true
    actor_lr: 1e-05
    actor_lr_scheduler_type: cosine
    actor_lr_warmup_ratio: 0.03
    actor_weight_decay: 0.01
    adam_betas:
    - 0.9
    - 0.95
    bf16: true
    clip_range_ratio: 0.2
    clip_range_score: 50
    clip_range_value: 5
    critic_gradient_checkpointing: true
    critic_lr: 5e-06
    critic_lr_scheduler_type: constant
    critic_lr_warmup_ratio: 0.03
    critic_weight_decay: 0
    ds_cfgs: ds_z3_config.json
    epochs: 3
    eval_interval: 10
    eval_strategy: epoch
    fp16: false
    freeze_language_model: true
    freeze_mm_proj: true
    freeze_vision_tower: false
    gae_lambda: 0.95
    gamma: 1
    gradient_accumulation_steps: 2
    kl_coeff: 0.02
    normalize_reward: false
    per_device_eval_batch_size: 8
    per_device_prompt_batch_size: 8
    per_device_train_batch_size: 8
    ptx_coeff: 16
    seed: 42
    update_iters: 1
wandb/run-20241023_090557-paei5sn7/files/output.log
ADDED
@@ -0,0 +1,261 @@
***** Running training *****
Training 1/3 epoch: 0%| | 0/237 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Training 1/3 epoch (reward -0.3069): 13%|██████████████████████████████▌ | 30/237 [1:00:27<12:52:22, 223.88s/it]
[2024-10-23 09:13:46,654] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:13:50,992] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:18:33,696] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:18:38,270] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:23:48,615] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:23:53,939] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:27:04,368] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:27:08,626] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:30:00,787] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:30:04,867] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:32:26,802] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:32:30,881] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:36:44,344] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:36:47,973] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:39:01,718] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:39:01,719] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[9.908858470377793e-06, 9.908858470377793e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-23 09:39:01,719] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=14.571748606590951, CurrSamplesPerSec=15.598687308172462, MemAllocated=33.2GB, MaxMemAllocated=48.11GB
[2024-10-23 09:39:05,687] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:39:05,688] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-23 09:39:05,689] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=15.239809655137655, CurrSamplesPerSec=16.432232979486614, MemAllocated=33.2GB, MaxMemAllocated=48.11GB
[2024-10-23 09:41:40,302] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:41:43,921] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:58:00,759] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 09:58:06,692] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:06:16,814] [WARNING] [stage3.py:2104:step] 3 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:06:22,976] [WARNING] [stage3.py:2104:step] 4 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
Saving checkpoint at step 30 ...
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
Saving 16-bit model...
[2024-10-23 10:06:35,014] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step15 is about to be saved!
[2024-10-23 10:06:35,016] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin, tag: global_step15
[2024-10-23 10:06:35,016] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin...
[2024-10-23 10:06:58,291] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin.
[2024-10-23 10:06:58,293] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step15 is ready now!
Model saved!
Saving 16-bit model...
[2024-10-23 10:07:09,145] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step15 is about to be saved!
[2024-10-23 10:07:09,146] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin, tag: global_step15
[2024-10-23 10:07:09,146] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin...
[2024-10-23 10:07:32,380] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_30.bin.
[2024-10-23 10:07:32,381] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step15 is ready now!
Model saved!
Model saved!
Checkpoint saved.
[2024-10-23 10:15:51,652] [WARNING] [stage3.py:2104:step] 3 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:15:57,685] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:16:30,368] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:16:33,757] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:17:06,123] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:17:09,627] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:17:35,330] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:17:38,641] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:18:10,448] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:18:10,449] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[9.470431355738257e-06, 9.470431355738257e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-23 10:18:10,450] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=14.073356193448515, CurrSamplesPerSec=18.590383641536306, MemAllocated=33.2GB, MaxMemAllocated=52.44GB
[2024-10-23 10:18:13,794] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:18:13,795] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-23 10:18:13,795] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=14.764142588014744, CurrSamplesPerSec=19.524800757549972, MemAllocated=33.2GB, MaxMemAllocated=52.44GB
[2024-10-23 10:18:45,865] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:18:49,172] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:19:24,641] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:19:27,927] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:19:54,182] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:19:57,506] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:20:51,612] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:20:54,879] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:21:17,299] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:21:20,616] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:21:42,642] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:21:45,941] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:22:12,002] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:22:15,333] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:22:36,950] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-23 10:22:40,236] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
78 |
+
[2024-10-23 10:23:01,348] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
79 |
+
[2024-10-23 10:23:01,349] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[8.70045279830626e-06, 8.70045279830626e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
80 |
+
[2024-10-23 10:23:01,350] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=15.461989587711498, CurrSamplesPerSec=19.013700639766963, MemAllocated=33.15GB, MaxMemAllocated=52.44GB
|
81 |
+
[2024-10-23 10:23:04,658] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
82 |
+
[2024-10-23 10:23:04,659] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
83 |
+
[2024-10-23 10:23:04,659] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=16.198688238179816, CurrSamplesPerSec=19.81263984833098, MemAllocated=33.15GB, MaxMemAllocated=52.44GB
|
84 |
+
Saving checkpoint at step 60 ...
|
85 |
+
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
|
86 |
+
Saving 16-bit model...
|
87 |
+
[2024-10-23 10:23:19,118] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step30 is about to be saved!
|
88 |
+
[2024-10-23 10:23:19,120] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin, tag: global_step30
|
89 |
+
[2024-10-23 10:23:19,120] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin...
|
90 |
+
[2024-10-23 10:23:37,909] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin.
|
91 |
+
[2024-10-23 10:23:37,912] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step30 is ready now!
|
92 |
+
Model saved!
|
93 |
+
Saving 16-bit model...
|
94 |
+
[2024-10-23 10:23:49,578] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step30 is about to be saved!
|
95 |
+
[2024-10-23 10:23:49,578] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin, tag: global_step30
|
96 |
+
[2024-10-23 10:23:49,579] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin...
|
97 |
+
[2024-10-23 10:24:12,803] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_60.bin.
|
98 |
+
[2024-10-23 10:24:12,804] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step30 is ready now!
|
99 |
+
Model saved!
|
100 |
+
Model saved!
|
101 |
+
Checkpoint saved.
|
102 |
+
[2024-10-23 10:25:22,814] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
103 |
+
[2024-10-23 10:25:26,107] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
104 |
+
[2024-10-23 10:26:58,273] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
105 |
+
[2024-10-23 10:27:01,584] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
106 |
+
[2024-10-23 10:28:01,902] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[7.656028585269017e-06, 7.656028585269017e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
107 |
+
[2024-10-23 10:28:01,903] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=16.36638951494034, CurrSamplesPerSec=29.39789939491861, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
108 |
+
[2024-10-23 10:28:05,224] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
109 |
+
[2024-10-23 10:28:05,225] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=17.12711182391296, CurrSamplesPerSec=30.71016374643415, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
110 |
+
[2024-10-23 10:28:25,905] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
111 |
+
[2024-10-23 10:28:29,159] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
112 |
+
[2024-10-23 10:29:13,184] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
113 |
+
[2024-10-23 10:29:16,439] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
114 |
+
[2024-10-23 10:29:37,209] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
115 |
+
[2024-10-23 10:29:40,499] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
116 |
+
Saving checkpoint at step 90 ...
|
117 |
+
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
|
118 |
+
Saving 16-bit model...
|
119 |
+
[2024-10-23 10:30:18,686] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step45 is about to be saved!
|
120 |
+
[2024-10-23 10:30:18,688] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin, tag: global_step45
|
121 |
+
[2024-10-23 10:30:18,688] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin...
|
122 |
+
[2024-10-23 10:30:38,714] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin.
|
123 |
+
[2024-10-23 10:30:38,715] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step45 is ready now!
|
124 |
+
Model saved!
|
125 |
+
Saving 16-bit model...
|
126 |
+
[2024-10-23 10:30:48,994] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step45 is about to be saved!
|
127 |
+
[2024-10-23 10:30:48,995] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin, tag: global_step45
|
128 |
+
[2024-10-23 10:30:48,995] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin...
|
129 |
+
[2024-10-23 10:31:12,474] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_90.bin.
|
130 |
+
[2024-10-23 10:31:12,476] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step45 is ready now!
|
131 |
+
Model saved!
|
132 |
+
Model saved!
|
133 |
+
Checkpoint saved.
|
134 |
+
[2024-10-23 10:33:08,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[6.41461888258465e-06, 6.41461888258465e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
135 |
+
[2024-10-23 10:33:08,192] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=16.840242031082227, CurrSamplesPerSec=18.082056043756573, MemAllocated=33.1GB, MaxMemAllocated=52.44GB
|
136 |
+
[2024-10-23 10:33:11,652] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
137 |
+
[2024-10-23 10:33:11,653] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=17.604303267186207, CurrSamplesPerSec=19.299485552228777, MemAllocated=33.1GB, MaxMemAllocated=52.44GB
|
138 |
+
[2024-10-23 10:37:08,335] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[5.068293368829755e-06, 5.068293368829755e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
139 |
+
[2024-10-23 10:37:08,336] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=17.145914370621718, CurrSamplesPerSec=19.351638771074402, MemAllocated=33.14GB, MaxMemAllocated=52.44GB
|
140 |
+
[2024-10-23 10:37:11,634] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
141 |
+
[2024-10-23 10:37:11,635] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=17.89246566680119, CurrSamplesPerSec=19.82526412245786, MemAllocated=33.14GB, MaxMemAllocated=52.44GB
|
142 |
+
Saving checkpoint at step 120 ...
|
143 |
+
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
|
144 |
+
Saving 16-bit model...
|
145 |
+
[2024-10-23 10:37:26,476] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step60 is about to be saved!
|
146 |
+
[2024-10-23 10:37:26,477] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin, tag: global_step60
|
147 |
+
[2024-10-23 10:37:26,478] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin...
|
148 |
+
[2024-10-23 10:37:44,898] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin.
|
149 |
+
[2024-10-23 10:37:44,900] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step60 is ready now!
|
150 |
+
Model saved!
|
151 |
+
Saving 16-bit model...
|
152 |
+
[2024-10-23 10:37:55,791] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step60 is about to be saved!
|
153 |
+
[2024-10-23 10:37:55,792] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin, tag: global_step60
|
154 |
+
[2024-10-23 10:37:55,792] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin...
|
155 |
+
[2024-10-23 10:38:18,798] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_120.bin.
|
156 |
+
[2024-10-23 10:38:18,799] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step60 is ready now!
|
157 |
+
Model saved!
|
158 |
+
Model saved!
|
159 |
+
Checkpoint saved.
|
160 |
+
[2024-10-23 10:42:13,477] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[3.7169028483301333e-06, 3.7169028483301333e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
161 |
+
[2024-10-23 10:42:13,478] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=17.37727340710681, CurrSamplesPerSec=18.834808666541562, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
162 |
+
[2024-10-23 10:42:16,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
163 |
+
[2024-10-23 10:42:16,796] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=18.13780306709264, CurrSamplesPerSec=19.724826811206178, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
164 |
+
Saving checkpoint at step 150 ...
|
165 |
+
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
|
166 |
+
Saving 16-bit model...
|
167 |
+
[2024-10-23 10:44:30,480] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step75 is about to be saved!
|
168 |
+
[2024-10-23 10:44:30,481] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin, tag: global_step75
|
169 |
+
[2024-10-23 10:44:30,482] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin...
|
170 |
+
[2024-10-23 10:44:50,645] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin.
|
171 |
+
[2024-10-23 10:44:50,646] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step75 is ready now!
|
172 |
+
Model saved!
|
173 |
+
Saving 16-bit model...
|
174 |
+
[2024-10-23 10:45:00,723] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step75 is about to be saved!
|
175 |
+
[2024-10-23 10:45:00,724] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin, tag: global_step75
|
176 |
+
[2024-10-23 10:45:00,725] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin...
|
177 |
+
[2024-10-23 10:45:22,429] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_150.bin.
|
178 |
+
[2024-10-23 10:45:22,430] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step75 is ready now!
|
179 |
+
Model saved!
|
180 |
+
Model saved!
|
181 |
+
Checkpoint saved.
|
182 |
+
[2024-10-23 10:47:09,650] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[2.4606737737909696e-06, 2.4606737737909696e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
183 |
+
[2024-10-23 10:47:09,651] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=17.61415636579215, CurrSamplesPerSec=18.93111536799049, MemAllocated=33.29GB, MaxMemAllocated=52.44GB
|
184 |
+
[2024-10-23 10:47:13,054] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
185 |
+
[2024-10-23 10:47:13,055] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=18.395644821730034, CurrSamplesPerSec=19.45837151531338, MemAllocated=33.29GB, MaxMemAllocated=52.44GB
|
186 |
+
[2024-10-23 10:48:21,406] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
187 |
+
[2024-10-23 10:48:24,761] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
188 |
+
[2024-10-23 10:48:45,180] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
189 |
+
[2024-10-23 10:48:48,458] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
190 |
+
[2024-10-23 10:50:43,651] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
191 |
+
[2024-10-23 10:50:46,969] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
192 |
+
[2024-10-23 10:51:07,962] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[1.3927749088052218e-06, 1.3927749088052218e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
193 |
+
[2024-10-23 10:51:07,962] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=17.746439174563342, CurrSamplesPerSec=17.855702558049163, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
194 |
+
[2024-10-23 10:51:11,460] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
195 |
+
[2024-10-23 10:51:11,461] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=18.529656252919512, CurrSamplesPerSec=18.696832537132607, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
196 |
+
Saving checkpoint at step 180 ...
|
197 |
+
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
|
198 |
+
Saving 16-bit model...
|
199 |
+
[2024-10-23 10:51:23,210] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step90 is about to be saved!
|
200 |
+
[2024-10-23 10:51:23,211] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin, tag: global_step90
|
201 |
+
[2024-10-23 10:51:23,211] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin...
|
202 |
+
[2024-10-23 10:51:42,948] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin.
|
203 |
+
[2024-10-23 10:51:42,950] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step90 is ready now!
|
204 |
+
Model saved!
|
205 |
+
Saving 16-bit model...
|
206 |
+
[2024-10-23 10:51:51,561] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step90 is about to be saved!
|
207 |
+
[2024-10-23 10:51:51,562] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin, tag: global_step90
|
208 |
+
[2024-10-23 10:51:51,563] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin...
|
209 |
+
[2024-10-23 10:52:13,036] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_180.bin.
|
210 |
+
[2024-10-23 10:52:13,038] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step90 is ready now!
|
211 |
+
Model saved!
|
212 |
+
Model saved!
|
213 |
+
Checkpoint saved.
|
214 |
+
[2024-10-23 10:53:45,229] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
215 |
+
[2024-10-23 10:53:48,631] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
216 |
+
[2024-10-23 10:56:08,616] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[5.924074268766422e-07, 5.924074268766422e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
217 |
+
[2024-10-23 10:56:08,617] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=17.839255597575846, CurrSamplesPerSec=18.94653826925029, MemAllocated=33.12GB, MaxMemAllocated=52.44GB
|
218 |
+
[2024-10-23 10:56:11,882] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
219 |
+
[2024-10-23 10:56:11,883] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=18.63801951889686, CurrSamplesPerSec=19.80692964117057, MemAllocated=33.12GB, MaxMemAllocated=52.44GB
|
220 |
+
Saving checkpoint at step 210 ...
|
221 |
+
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
|
222 |
+
Saving 16-bit model...
|
223 |
+
[2024-10-23 10:58:21,816] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step105 is about to be saved!
|
224 |
+
[2024-10-23 10:58:21,818] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin, tag: global_step105
|
225 |
+
[2024-10-23 10:58:21,818] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin...
|
226 |
+
[2024-10-23 10:58:38,899] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin.
|
227 |
+
[2024-10-23 10:58:38,901] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step105 is ready now!
|
228 |
+
Model saved!
|
229 |
+
Saving 16-bit model...
|
230 |
+
[2024-10-23 10:58:47,269] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step105 is about to be saved!
|
231 |
+
[2024-10-23 10:58:47,270] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin, tag: global_step105
|
232 |
+
[2024-10-23 10:58:47,270] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin...
|
233 |
+
[2024-10-23 10:59:09,672] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model_210.bin.
|
234 |
+
[2024-10-23 10:59:09,674] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step105 is ready now!
|
235 |
+
Model saved!
|
236 |
+
Model saved!
|
237 |
+
Checkpoint saved.
|
238 |
+
[2024-10-23 11:00:41,653] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
239 |
+
[2024-10-23 11:00:44,969] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
240 |
+
[2024-10-23 11:01:05,305] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[1.1893092270227724e-07, 1.1893092270227724e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
241 |
+
[2024-10-23 11:01:05,306] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=17.926464030034182, CurrSamplesPerSec=18.82486069382099, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
242 |
+
[2024-10-23 11:01:08,570] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
243 |
+
[2024-10-23 11:01:08,571] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=18.731570052517945, CurrSamplesPerSec=19.83092777116667, MemAllocated=33.13GB, MaxMemAllocated=52.44GB
|
244 |
+
[2024-10-23 11:01:53,444] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
245 |
+
[2024-10-23 11:01:56,743] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
246 |
+
Saving model to "../outputs/ppo_ti2ti_llf_1023_step_800" ...
|
247 |
+
Saving 16-bit model...
|
248 |
+
[2024-10-23 11:04:33,468] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step118 is about to be saved!
|
249 |
+
[2024-10-23 11:04:33,469] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin, tag: global_step118
|
250 |
+
[2024-10-23 11:04:33,469] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin...
|
251 |
+
[2024-10-23 11:04:53,660] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin.
|
252 |
+
[2024-10-23 11:04:53,662] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step118 is ready now!
|
253 |
+
Model saved!
|
254 |
+
Saving 16-bit model...
|
255 |
+
[2024-10-23 11:05:03,419] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step118 is about to be saved!
|
256 |
+
[2024-10-23 11:05:03,420] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin, tag: global_step118
|
257 |
+
[2024-10-23 11:05:03,420] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin...
|
258 |
+
[2024-10-23 11:05:25,565] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_llf_1023_step_800/pytorch_model.bin.
|
259 |
+
[2024-10-23 11:05:25,566] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step118 is ready now!
|
260 |
+
Model saved!
|
261 |
+
Model saved!
|
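Note: the stage3.py warnings repeated throughout the output.log above recommend adding get_accelerator().empty_cache() calls to the training loop when allocator cache flushes occur under memory pressure. A minimal sketch of that remedy is given below; it is not part of the logged run, and the step function and variable names are hypothetical placeholders around the real DeepSpeed API the warning refers to.

    from deepspeed.accelerator import get_accelerator

    def train_step(engine, batch):
        # Hypothetical PPO-style update step; only the empty_cache() call is
        # taken from the warning text above.
        loss = engine(batch)        # forward pass on the DeepSpeed engine
        engine.backward(loss)       # backward pass
        engine.step()               # optimizer step (where the warning is emitted)
        # Flush the PyTorch caching allocator at the same point on every rank,
        # so all ranks flush their caches at the same time.
        get_accelerator().empty_cache()
        return loss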
wandb/run-20241023_090557-paei5sn7/files/requirements.txt
ADDED
@@ -0,0 +1,230 @@
1 |
+
align-anything==0.0.1.dev0
|
2 |
+
nvidia-cusolver-cu12==11.4.5.107
|
3 |
+
nvidia-curand-cu12==10.3.2.106
|
4 |
+
wcwidth==0.2.13
|
5 |
+
nvidia-cuda-nvrtc-cu12==12.1.105
|
6 |
+
pycparser==2.22
|
7 |
+
tokenizers==0.19.1
|
8 |
+
grpcio==1.66.2
|
9 |
+
joblib==1.4.2
|
10 |
+
virtualenv==20.26.6
|
11 |
+
pyzmq==26.2.0
|
12 |
+
cffi==1.17.1
|
13 |
+
sentencepiece==0.2.0
|
14 |
+
nvidia-nvtx-cu12==12.1.105
|
15 |
+
docker-pycreds==0.4.0
|
16 |
+
outlines==0.1.1.dev4+ga2fd35c
|
17 |
+
nvidia-nvjitlink-cu12==12.6.77
|
18 |
+
annotated-types==0.7.0
|
19 |
+
certifi==2024.8.30
|
20 |
+
interegular==0.3.3
|
21 |
+
Jinja2==3.1.4
|
22 |
+
Brotli==1.1.0
|
23 |
+
fairscale==0.4.13
|
24 |
+
gradio_client==1.4.0
|
25 |
+
opencv-python==4.6.0.66
|
26 |
+
pyarrow==17.0.0
|
27 |
+
absl-py==2.1.0
|
28 |
+
lm-format-enforcer==0.10.6
|
29 |
+
pydantic_core==2.23.4
|
30 |
+
llvmlite==0.43.0
|
31 |
+
accelerate==1.0.1
|
32 |
+
pytest-split==0.8.0
|
33 |
+
nvidia-cuda-cupti-cu12==12.1.105
|
34 |
+
watchfiles==0.24.0
|
35 |
+
optree==0.13.0
|
36 |
+
py-cpuinfo==9.0.0
|
37 |
+
scikit-learn==1.5.2
|
38 |
+
ftfy==6.3.0
|
39 |
+
fastapi==0.115.0
|
40 |
+
psutil==6.0.0
|
41 |
+
MarkupSafe==2.1.5
|
42 |
+
nvidia-cublas-cu12==12.1.3.1
|
43 |
+
pip==24.2
|
44 |
+
websockets==12.0
|
45 |
+
tomlkit==0.12.0
|
46 |
+
torchaudio==2.4.0
|
47 |
+
huggingface-hub==0.25.2
|
48 |
+
mistral_common==1.4.4
|
49 |
+
image-reward==1.5
|
50 |
+
pyparsing==3.1.4
|
51 |
+
aiohappyeyeballs==2.4.3
|
52 |
+
click==8.1.7
|
53 |
+
httptools==0.6.1
|
54 |
+
decorator==4.4.2
|
55 |
+
tqdm==4.66.5
|
56 |
+
fonttools==4.54.1
|
57 |
+
kiwisolver==1.4.7
|
58 |
+
ruff==0.6.9
|
59 |
+
openai==1.51.2
|
60 |
+
partial-json-parser==0.2.1.1.post4
|
61 |
+
xformers==0.0.27.post2
|
62 |
+
distlib==0.3.9
|
63 |
+
GitPython==3.1.43
|
64 |
+
pytest==7.2.0
|
65 |
+
imageio==2.35.1
|
66 |
+
msgspec==0.18.6
|
67 |
+
proglog==0.1.10
|
68 |
+
yarl==1.15.0
|
69 |
+
markdown-it-py==3.0.0
|
70 |
+
PyYAML==6.0.2
|
71 |
+
xxhash==3.5.0
|
72 |
+
braceexpand==0.1.7
|
73 |
+
datasets==3.0.1
|
74 |
+
mpmath==1.3.0
|
75 |
+
distro==1.9.0
|
76 |
+
term-image==0.7.2
|
77 |
+
python-dotenv==1.0.1
|
78 |
+
semantic-version==2.10.0
|
79 |
+
multidict==6.1.0
|
80 |
+
vllm==0.6.2
|
81 |
+
sentry-sdk==2.16.0
|
82 |
+
idna==3.10
|
83 |
+
starlette==0.38.6
|
84 |
+
args==0.1.0
|
85 |
+
peft==0.13.2
|
86 |
+
librosa==0.10.2.post1
|
87 |
+
urllib3==2.2.3
|
88 |
+
python-dateutil==2.9.0.post0
|
89 |
+
pycountry==24.6.1
|
90 |
+
six==1.16.0
|
91 |
+
ffmpy==0.4.0
|
92 |
+
multiprocess==0.70.16
|
93 |
+
cycler==0.12.1
|
94 |
+
charset-normalizer==3.4.0
|
95 |
+
aiofiles==23.2.1
|
96 |
+
shellingham==1.5.4
|
97 |
+
propcache==0.2.0
|
98 |
+
lark==1.2.2
|
99 |
+
torch==2.4.0
|
100 |
+
Werkzeug==3.0.4
|
101 |
+
nvidia-cusparse-cu12==12.1.0.106
|
102 |
+
clip==0.2.0
|
103 |
+
hjson==3.1.0
|
104 |
+
diffusers==0.30.3
|
105 |
+
attrs==24.2.0
|
106 |
+
lazy_loader==0.4
|
107 |
+
numpy==1.26.4
|
108 |
+
rpds-py==0.20.0
|
109 |
+
pytz==2024.2
|
110 |
+
audioread==3.0.1
|
111 |
+
platformdirs==4.3.6
|
112 |
+
deepspeed==0.15.2
|
113 |
+
gguf==0.10.0
|
114 |
+
wandb==0.18.3
|
115 |
+
prometheus_client==0.21.0
|
116 |
+
gitdb==4.0.11
|
117 |
+
packaging==24.1
|
118 |
+
sympy==1.13.3
|
119 |
+
mutagen==1.47.0
|
120 |
+
contourpy==1.3.0
|
121 |
+
pluggy==1.5.0
|
122 |
+
python-multipart==0.0.12
|
123 |
+
soundfile==0.12.1
|
124 |
+
typer==0.12.5
|
125 |
+
timm==0.6.13
|
126 |
+
frozenlist==1.4.1
|
127 |
+
httpx==0.27.2
|
128 |
+
mmsg==0.1.dev20+g585c63a.d20241012
|
129 |
+
tiktoken==0.7.0
|
130 |
+
pydub==0.25.1
|
131 |
+
diskcache==5.6.3
|
132 |
+
einops==0.8.0
|
133 |
+
setproctitle==1.3.3
|
134 |
+
scipy==1.14.1
|
135 |
+
typing_extensions==4.12.2
|
136 |
+
httpcore==1.0.6
|
137 |
+
cfgv==3.4.0
|
138 |
+
requests==2.32.3
|
139 |
+
torchlibrosa==0.1.0
|
140 |
+
pydantic==2.9.2
|
141 |
+
torchvision==0.19.0
|
142 |
+
sniffio==1.3.1
|
143 |
+
pyairports==2.1.1
|
144 |
+
hpsv2==1.2.0
|
145 |
+
protobuf==3.20.3
|
146 |
+
wheel==0.44.0
|
147 |
+
smmap==5.0.1
|
148 |
+
zipp==3.20.2
|
149 |
+
iniconfig==2.0.0
|
150 |
+
airportsdata==20241001
|
151 |
+
clint==0.5.1
|
152 |
+
pooch==1.8.2
|
153 |
+
shortuuid==1.0.13
|
154 |
+
pycryptodomex==3.21.0
|
155 |
+
cloudpickle==3.1.0
|
156 |
+
transformers==4.44.0.dev0
|
157 |
+
regex==2024.9.11
|
158 |
+
numba==0.60.0
|
159 |
+
tzdata==2024.2
|
160 |
+
orjson==3.10.7
|
161 |
+
jsonschema-specifications==2024.10.1
|
162 |
+
safetensors==0.4.5
|
163 |
+
outlines_core==0.1.0
|
164 |
+
filelock==3.16.1
|
165 |
+
threadpoolctl==3.5.0
|
166 |
+
soxr==0.5.0.post1
|
167 |
+
nvidia-cufft-cu12==11.0.2.54
|
168 |
+
networkx==3.4.1
|
169 |
+
msgpack==1.1.0
|
170 |
+
pandas==2.2.3
|
171 |
+
align-anything==0.0.1.dev0
|
172 |
+
anyio==4.6.0
|
173 |
+
nvidia-cuda-runtime-cu12==12.1.105
|
174 |
+
bitsandbytes==0.44.1
|
175 |
+
aiohttp==3.10.10
|
176 |
+
matplotlib==3.9.2
|
177 |
+
triton==3.0.0
|
178 |
+
tensorboard==2.18.0
|
179 |
+
nodeenv==1.9.1
|
180 |
+
fsspec==2024.6.1
|
181 |
+
webdataset==0.2.100
|
182 |
+
imageio-ffmpeg==0.5.1
|
183 |
+
mdurl==0.1.2
|
184 |
+
identify==2.6.1
|
185 |
+
h11==0.14.0
|
186 |
+
uvloop==0.20.0
|
187 |
+
rich==13.9.2
|
188 |
+
frechet-audio-distance==0.1.2
|
189 |
+
uvicorn==0.31.1
|
190 |
+
pytorch-fid==0.3.0
|
191 |
+
yt-dlp==2024.8.6
|
192 |
+
jiter==0.6.1
|
193 |
+
nest-asyncio==1.6.0
|
194 |
+
pre_commit==4.0.1
|
195 |
+
referencing==0.35.1
|
196 |
+
resampy==0.4.3
|
197 |
+
tensorboard-data-server==0.7.2
|
198 |
+
importlib_metadata==8.5.0
|
199 |
+
aiosignal==1.3.1
|
200 |
+
dill==0.3.8
|
201 |
+
prometheus-fastapi-instrumentator==7.0.0
|
202 |
+
ninja==1.11.1.1
|
203 |
+
nvidia-ml-py==12.560.30
|
204 |
+
moviepy==1.0.3
|
205 |
+
nvidia-cudnn-cu12==9.1.0.70
|
206 |
+
Markdown==3.7
|
207 |
+
ray==2.37.0
|
208 |
+
gradio==5.0.2
|
209 |
+
jsonschema==4.23.0
|
210 |
+
Pygments==2.18.0
|
211 |
+
nvidia-nccl-cu12==2.20.5
|
212 |
+
pillow==10.4.0
|
213 |
+
setuptools==75.1.0
|
214 |
+
jaraco.text==3.12.1
|
215 |
+
inflect==7.3.1
|
216 |
+
jaraco.collections==5.1.0
|
217 |
+
autocommand==2.2.2
|
218 |
+
tomli==2.0.1
|
219 |
+
jaraco.context==5.3.0
|
220 |
+
jaraco.functools==4.0.1
|
221 |
+
importlib_resources==6.4.0
|
222 |
+
wheel==0.43.0
|
223 |
+
packaging==24.1
|
224 |
+
backports.tarfile==1.2.0
|
225 |
+
importlib_metadata==8.0.0
|
226 |
+
typing_extensions==4.12.2
|
227 |
+
zipp==3.19.2
|
228 |
+
typeguard==4.3.0
|
229 |
+
more-itertools==10.3.0
|
230 |
+
platformdirs==4.2.2
|
wandb/run-20241023_090557-paei5sn7/files/wandb-metadata.json
ADDED
@@ -0,0 +1,102 @@
1 |
+
{
|
2 |
+
"os": "Linux-5.4.0-196-generic-x86_64-with-glibc2.31",
|
3 |
+
"python": "3.11.10",
|
4 |
+
"startedAt": "2024-10-23T09:05:57.415346Z",
|
5 |
+
"args": [
|
6 |
+
"--local_rank=0",
|
7 |
+
"--actor_model_name_or_path",
|
8 |
+
"/data/align-anything/hantao/models/0916_ti_to_ti_sft",
|
9 |
+
"--reward_model_name_or_path",
|
10 |
+
"/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800",
|
11 |
+
"--reward_critic_model_name_or_path",
|
12 |
+
"/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800",
|
13 |
+
"--train_datasets",
|
14 |
+
"/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs",
|
15 |
+
"--train_template",
|
16 |
+
"spavl_ti2ti",
|
17 |
+
"--train_data_files",
|
18 |
+
"ti2ti_llf_prompt_only_tokenize.pt",
|
19 |
+
"--output_dir",
|
20 |
+
"../outputs/ppo_ti2ti_llf_1023_step_800",
|
21 |
+
"--save_interval",
|
22 |
+
"30"
|
23 |
+
],
|
24 |
+
"program": "-m align_anything.trainers.text_image_to_text_image.ppo",
|
25 |
+
"git": {
|
26 |
+
"remote": "https://github.com/PKU-Alignment/align-anything.git",
|
27 |
+
"commit": "6fde660afc9985323f147930eedf188a5699adc7"
|
28 |
+
},
|
29 |
+
"email": "[email protected]",
|
30 |
+
"root": "../outputs/ppo_ti2ti_llf_1023_step_800",
|
31 |
+
"host": "lyg0196",
|
32 |
+
"username": "align-anything",
|
33 |
+
"executable": "/home/align-anything/miniconda3/envs/hantao_cham/bin/python",
|
34 |
+
"cpu_count": 64,
|
35 |
+
"cpu_count_logical": 128,
|
36 |
+
"gpu": "[NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB]",
|
37 |
+
"gpu_count": 8,
|
38 |
+
"disk": {
|
39 |
+
"/": {
|
40 |
+
"total": "940744544256",
|
41 |
+
"used": "297219252224"
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"memory": {
|
45 |
+
"total": "540647583744"
|
46 |
+
},
|
47 |
+
"cpu": {
|
48 |
+
"count": 64,
|
49 |
+
"countLogical": 128
|
50 |
+
},
|
51 |
+
"gpu_nvidia": [
|
52 |
+
{
|
53 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
54 |
+
"memoryTotal": "85899345920",
|
55 |
+
"cudaCores": 6912,
|
56 |
+
"architecture": "Ampere"
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
60 |
+
"memoryTotal": "85899345920",
|
61 |
+
"cudaCores": 6912,
|
62 |
+
"architecture": "Ampere"
|
63 |
+
},
|
64 |
+
{
|
65 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
66 |
+
"memoryTotal": "85899345920",
|
67 |
+
"cudaCores": 6912,
|
68 |
+
"architecture": "Ampere"
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
72 |
+
"memoryTotal": "85899345920",
|
73 |
+
"cudaCores": 6912,
|
74 |
+
"architecture": "Ampere"
|
75 |
+
},
|
76 |
+
{
|
77 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
78 |
+
"memoryTotal": "85899345920",
|
79 |
+
"cudaCores": 6912,
|
80 |
+
"architecture": "Ampere"
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
84 |
+
"memoryTotal": "85899345920",
|
85 |
+
"cudaCores": 6912,
|
86 |
+
"architecture": "Ampere"
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
90 |
+
"memoryTotal": "85899345920",
|
91 |
+
"cudaCores": 6912,
|
92 |
+
"architecture": "Ampere"
|
93 |
+
},
|
94 |
+
{
|
95 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
96 |
+
"memoryTotal": "85899345920",
|
97 |
+
"cudaCores": 6912,
|
98 |
+
"architecture": "Ampere"
|
99 |
+
}
|
100 |
+
],
|
101 |
+
"cudaVersion": "12.4"
|
102 |
+
}
|
wandb/run-20241023_090557-paei5sn7/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
1 |
+
{"train/reward_critic_loss":0.40057873725891113,"train/reward_advantage":0.3540787100791931,"train/mean_generated_length":1,"_runtime":7168.249961391,"train/reward_value":-0.5498046875,"train/reward_critic_lr":5e-06,"train/kl_divergence":4.317548751831055,"_wandb":{"runtime":7168},"train/reward_with_kl_penalty":-0.1957259625196457,"train/max_generated_length":1,"train/actor_loss":-0.3540787100791931,"train/reward":-0.109375,"_step":236,"train/actor_lr":0,"train/reward_return":-0.1957259625196457,"train/step":236,"_timestamp":1.7296814635198205e+09}
|
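The wandb-summary.json above is a single JSON object holding the final logged metrics of the run (e.g. train/reward, train/kl_divergence, _step). A minimal sketch for inspecting it offline, assuming only the Python standard library and the file path as uploaded in this commit:

    import json

    # Path as listed in this upload; adjust if the file is stored elsewhere.
    path = "wandb/run-20241023_090557-paei5sn7/files/wandb-summary.json"
    with open(path) as f:
        summary = json.load(f)
    # Keys taken from the summary shown above.
    print(summary["train/reward"], summary["train/kl_divergence"], summary["_step"])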
wandb/run-20241023_090557-paei5sn7/logs/debug-internal.log
ADDED
@@ -0,0 +1,20 @@
1 |
+
{"time":"2024-10-23T09:05:57.421439083Z","level":"INFO","msg":"using version","core version":"0.18.3"}
|
2 |
+
{"time":"2024-10-23T09:05:57.421466903Z","level":"INFO","msg":"created symlink","path":"../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug-core.log"}
|
3 |
+
{"time":"2024-10-23T09:05:57.425862224Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
|
4 |
+
{"time":"2024-10-23T09:05:57.446650446Z","level":"INFO","msg":"created new stream","id":"paei5sn7"}
|
5 |
+
{"time":"2024-10-23T09:05:57.44668639Z","level":"INFO","msg":"stream: started","id":"paei5sn7"}
|
6 |
+
{"time":"2024-10-23T09:05:57.446720716Z","level":"INFO","msg":"handler: started","stream_id":{"value":"paei5sn7"}}
|
7 |
+
{"time":"2024-10-23T09:05:57.446711826Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"paei5sn7"}}
|
8 |
+
{"time":"2024-10-23T09:05:57.446734515Z","level":"INFO","msg":"sender: started","stream_id":{"value":"paei5sn7"}}
|
9 |
+
{"time":"2024-10-23T09:05:58.065915529Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
|
10 |
+
{"time":"2024-10-23T09:05:58.068726401Z","level":"INFO","msg":"Starting system monitor"}
|
11 |
+
{"time":"2024-10-23T11:05:25.66533688Z","level":"INFO","msg":"Stopping system monitor"}
|
12 |
+
{"time":"2024-10-23T11:05:25.691953146Z","level":"INFO","msg":"Stopped system monitor"}
|
13 |
+
{"time":"2024-10-23T11:05:26.186981401Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
|
14 |
+
{"time":"2024-10-23T11:05:26.187015556Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
|
15 |
+
{"time":"2024-10-23T11:05:27.289810318Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
16 |
+
{"time":"2024-10-23T11:05:28.84134155Z","level":"INFO","msg":"stream: closing","id":"paei5sn7"}
|
17 |
+
{"time":"2024-10-23T11:05:28.841377348Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"paei5sn7"}}
|
18 |
+
{"time":"2024-10-23T11:05:28.841437021Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"paei5sn7"}}
|
19 |
+
{"time":"2024-10-23T11:05:28.841525942Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"paei5sn7"}}
|
20 |
+
{"time":"2024-10-23T11:05:28.842923992Z","level":"INFO","msg":"stream: closed","id":"paei5sn7"}
|
wandb/run-20241023_090557-paei5sn7/logs/debug.log
ADDED
@@ -0,0 +1,33 @@
1 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
|
2 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Configure stats pid to 1071579
|
3 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
|
4 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
|
5 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
|
6 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
|
7 |
+
2024-10-23 09:05:57,409 WARNING MainThread:1071579 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.ppo
|
8 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.ppo'}
|
9 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_setup.py:_flush():79] Applying login settings: {}
|
10 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:_log_setup():532] Logging user logs to ../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug.log
|
11 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:_log_setup():533] Logging internal logs to ../outputs/ppo_ti2ti_llf_1023_step_800/wandb/run-20241023_090557-paei5sn7/logs/debug-internal.log
|
12 |
+
2024-10-23 09:05:57,409 INFO MainThread:1071579 [wandb_init.py:init():617] calling init triggers
|
13 |
+
2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
|
14 |
+
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_prompt_batch_size': 8, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 2, 'actor_gradient_checkpointing': True, 'critic_gradient_checkpointing': True, 'actor_lr': 1e-05, 'actor_lr_scheduler_type': 'cosine', 'actor_lr_warmup_ratio': 0.03, 'actor_weight_decay': 0.01, 'critic_lr': 5e-06, 'critic_lr_scheduler_type': 'constant', 'critic_lr_warmup_ratio': 0.03, 'critic_weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'kl_coeff': 0.02, 'clip_range_ratio': 0.2, 'clip_range_score': 50.0, 'clip_range_value': 5.0, 'ptx_coeff': 16.0, 'gamma': 1.0, 'gae_lambda': 0.95, 'normalize_reward': False, 'update_iters': 1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs', 'train_template': 'spavl_ti2ti', 'train_size': 5000, 'train_split': None, 'train_subset': None, 'train_data_files': 'ti2ti_llf_prompt_only_tokenize.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': [], 'ptx_datasets': None, 'ptx_template': None, 'ptx_size': None, 'ptx_subset': None, 'ptx_split': None, 'ptx_data_files': None, 'ptx_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'ppo', 'output_dir': '../outputs/ppo_ti2ti_llf_1023_step_800', 'cache_dir': None, 'save_interval': 30.0}, 'model_cfgs': {'actor_model_name_or_path': '/data/align-anything/hantao/models/0916_ti_to_ti_sft', 'reward_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800', 'reward_critic_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_llf_1017/slice_800', 'trust_remote_code': True, 'model_max_length': 2048, 'temperature': 1.0, 'top_p': 1.0, 'repetition_penalty': 1.0}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': None}
|
15 |
+
2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():667] starting backend
|
16 |
+
2024-10-23 09:05:57,410 INFO MainThread:1071579 [wandb_init.py:init():671] sending inform_init request
|
17 |
+
2024-10-23 09:05:57,414 INFO MainThread:1071579 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
18 |
+
2024-10-23 09:05:57,415 INFO MainThread:1071579 [wandb_init.py:init():684] backend started and connected
|
19 |
+
2024-10-23 09:05:57,418 INFO MainThread:1071579 [wandb_init.py:init():779] updated telemetry
|
20 |
+
2024-10-23 09:05:57,429 INFO MainThread:1071579 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
|
21 |
+
2024-10-23 09:05:58,062 INFO MainThread:1071579 [wandb_init.py:init():863] starting run threads in backend
|
22 |
+
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_console_start():2465] atexit reg
|
23 |
+
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2313] redirect: wrap_raw
|
24 |
+
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2378] Wrapping output streams.
|
25 |
+
2024-10-23 09:05:58,195 INFO MainThread:1071579 [wandb_run.py:_redirect():2403] Redirects installed.
|
26 |
+
2024-10-23 09:05:58,197 INFO MainThread:1071579 [wandb_init.py:init():907] run started, returning control to user process
|
27 |
+
2024-10-23 11:05:25,658 INFO MainThread:1071579 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/paei5sn7
|
28 |
+
2024-10-23 11:05:25,662 INFO MainThread:1071579 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
|
29 |
+
2024-10-23 11:05:25,664 INFO MainThread:1071579 [wandb_run.py:_restore():2410] restore
|
30 |
+
2024-10-23 11:05:25,664 INFO MainThread:1071579 [wandb_run.py:_restore():2416] restore done
|
31 |
+
2024-10-23 11:05:28,807 INFO MainThread:1071579 [wandb_run.py:_footer_history_summary_info():4049] rendering history
|
32 |
+
2024-10-23 11:05:28,809 INFO MainThread:1071579 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
|
33 |
+
2024-10-23 11:05:28,839 INFO MainThread:1071579 [wandb_run.py:_footer_sync_info():4008] logging synced files
|
wandb/run-20241023_090557-paei5sn7/run-paei5sn7.wandb
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7875273453296c40e06ab7dc88ef1748a4c98f52eba4903dcc2530faa8b6a23d
|
3 |
+
size 6283361
|