xingzhikb commited on
Commit
002bd9b
·
0 Parent(s):
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .amltignore +17 -0
  2. .gitattributes +2 -0
  3. .gitignore +161 -0
  4. LICENSE +201 -0
  5. README.md +74 -0
  6. amlt_configs/accelerate_config.yaml +18 -0
  7. amlt_configs/accelerate_deepspeed_config.local.yaml +26 -0
  8. amlt_configs/accelerate_deepspeed_config.yaml +26 -0
  9. amlt_configs/debug-data_mount.yaml +52 -0
  10. amlt_configs/debug-sca.yaml +65 -0
  11. amlt_configs/debug.yaml +51 -0
  12. amlt_configs/infer-sam_captioner-region_chunkify-eval_suite.yaml +69 -0
  13. amlt_configs/infer-sca-eval_suite-ckpt.yaml +133 -0
  14. amlt_configs/infer-sca-eval_suite-coco_instance_task_type_caption-last_model.yaml +95 -0
  15. amlt_configs/infer-sca-eval_suite-vg-best_model.yaml +96 -0
  16. amlt_configs/infer-sca-eval_suite-vg-last_model.yaml +96 -0
  17. amlt_configs/post_process.sh +2 -0
  18. amlt_configs/setup.sh +144 -0
  19. amlt_configs/setup_accelerate_on_azure.sh +53 -0
  20. amlt_configs/setup_eval_suite.sh +28 -0
  21. amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml +126 -0
  22. amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml +128 -0
  23. amlt_configs/train-sca-ablat-model_arch-103123.yaml +112 -0
  24. amlt_configs/train-sca-ablat-sam_size-110423.yaml +108 -0
  25. amlt_configs/train-sca-ablat-timm.yaml +143 -0
  26. amlt_configs/train-sca-ablat-weak_sup_data.yaml +327 -0
  27. amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml +178 -0
  28. amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml +196 -0
  29. data/demo_cases/1000_IM-0003-1001.dcm.png +3 -0
  30. data/demo_cases/1000_IM-0003-2001.dcm.png +3 -0
  31. data/demo_cases/1000_IM-0003-3001.dcm.png +3 -0
  32. data/demo_cases/1001_IM-0004-1001.dcm.png +3 -0
  33. data/demo_cases/1001_IM-0004-1002.dcm.png +3 -0
  34. data/demo_cases/1002_IM-0004-1001.dcm.png +3 -0
  35. data/demo_cases/1002_IM-0004-2001.dcm.png +3 -0
  36. data/demo_cases/1003_IM-0005-2002.dcm.png +3 -0
  37. data/demo_cases/1004_IM-0005-1001.dcm.png +3 -0
  38. data/demo_cases/1004_IM-0005-2001.dcm.png +3 -0
  39. data/demo_cases/1005_IM-0006-1001.dcm.png +3 -0
  40. data/demo_cases/1005_IM-0006-3003.dcm.png +3 -0
  41. data/demo_cases/1006_IM-0007-1001.dcm.png +3 -0
  42. data/demo_cases/1006_IM-0007-3003.dcm.png +3 -0
  43. data/demo_cases/1007_IM-0008-1001.dcm.png +3 -0
  44. data/demo_cases/1007_IM-0008-2001.dcm.png +3 -0
  45. data/demo_cases/1007_IM-0008-3001.dcm.png +3 -0
  46. data/demo_cases/1008_IM-0009-2001.dcm.png +3 -0
  47. data/demo_cases/1008_IM-0009-4004.dcm.png +3 -0
  48. data/demo_cases/1009_IM-0010-1001.dcm.png +3 -0
  49. data/demo_cases/1009_IM-0010-2001.dcm.png +3 -0
  50. data/demo_cases/100_IM-0002-1001.dcm.png +3 -0
.amltignore ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /exp*
2
+ /tmp
3
+
4
+ /.mypy_cache
5
+ __pycache__/
6
+ /build
7
+ /data
8
+ /results
9
+ *.egg-info/
10
+ scripts/examples/
11
+ .amltconfig
12
+ /amlt
13
+ .*cache/
14
+ wandb/
15
+
16
+ build/
17
+ *.egg-info/
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ data/**/* filter=lfs diff=lfs merge=lfs -text
2
+ Medical-SAM2/data/** filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ wandb/
132
+
133
+ # VSCode
134
+ .vscode/*
135
+ # !.vscode/settings.json
136
+ # !.vscode/tasks.json
137
+ # !.vscode/launch.json
138
+ # !.vscode/extensions.json
139
+
140
+ # Hydra
141
+ .hydra
142
+ multirun.yaml
143
+ .submitit
144
+
145
+ # These should be symlinked.
146
+ exp
147
+ .*cache/
148
+ /tmp
149
+
150
+ # Download data manually.
151
+ data/all_instances_82K.jsonl
152
+ data/alpaca_data.json
153
+ data/user_oriented_instructions.jsonl
154
+
155
+ # Ignore amlt files
156
+ .amltconfig
157
+ /amlt
158
+
159
+ # Ignore slurm files
160
+ **/*slurm*/**
161
+ *.slurm
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Segment and Caption Anything
2
+
3
+ The repository contains the official implementation of "Segment and Caption Anything"
4
+
5
+ [Project Page](https://xk-huang.github.io/segment-caption-anything), [Paper](https://arxiv.org/abs/2312.00869)
6
+
7
+ ![teaser](./docs/teaser-github.svg)
8
+
9
+ tl;dr
10
+ 1. Despite the absence of semantic labels in the training data, SAM implies high-level semantics sufficient for captioning.
11
+ 2. SCA (b) is a lightweight augmentation of SAM (a) with the ability to generate regional captions.
12
+ 3. On top of SAM architecture, we add a fixed pre-trained language model, and an optimizable lightweight hybrid feature mixture whose training is cheap and scalable.
13
+
14
+ <table>
15
+ <tr>
16
+ <td><img src="./docs/anything-mode-00.png.jpg" alt="anything-mode-00"></td>
17
+ <td><img src="./docs/anything-mode-03.png.jpg" alt="anything-mode-01"></td>
18
+ </tr>
19
+ <tr>
20
+ <td><img src="./docs/anything-mode-01.png.jpg" alt="anything-mode-02"></td>
21
+ <td><img src="./docs/anything-mode-02.png.jpg" alt="anything-mode-03"></td>
22
+ </tr>
23
+ </table>
24
+
25
+ News
26
+
27
+ - [01/31/2024] Update the [paper](https://xk-huang.github.io/segment-caption-anything/files/segment-caption-anything.013124.pdf) and the [supp](https://xk-huang.github.io/segment-caption-anything/files/segment-caption-anything-supp.013124.pdf). Release code v0.0.2: bump transformers to 4.36.2, support mistral series, phi-2, zephyr; add experiments about SAM+Image Captioner+[V-CoT](https://github.com/ttengwang/Caption-Anything), and more.
28
+ - [12/05/2023] Release paper, code v0.0.1, and project page!
29
+
30
+ ## Environment Preparation
31
+
32
+ Please check [docs/ENV.md](docs/ENV.md).
33
+
34
+
35
+ ## Model Zoo
36
+
37
+ Please check [docs/MODEL_ZOO.md](docs/MODEL_ZOO.md)
38
+
39
+
40
+ ## Gradio Demo
41
+
42
+ Please check [docs/DEMO.md](docs/DEMO.md)
43
+
44
+
45
+ ## Running Training and Inference
46
+
47
+ Please check [docs/USAGE.md](docs/USAGE.md).
48
+
49
+
50
+ ## Experiments and Evaluation
51
+
52
+ Please check [docs/EVAL.md](docs/EVAL.md)
53
+
54
+ ## License
55
+
56
+ The trained weights are licensed under the [Apache 2.0 license](https://github.com/xk-huang/segment-caption-anything/blob/1c810bfcfeb3b95cd4b1f502f8f30c46333d58b8/LICENSE).
57
+
58
+ ## Acknowledgement
59
+
60
+ Deeply appreciate these wonderful open source projects: [transformers](https://github.com/huggingface/transformers), [accelerate](https://github.com/huggingface/accelerate), [deepspeed](https://github.com/microsoft/DeepSpeed), [detectron2](https://github.com/facebookresearch/detectron2), [hydra](https://github.com/facebookresearch/hydra), [timm](https://github.com/huggingface/pytorch-image-models), [gradio](https://github.com/gradio-app/gradio).
61
+
62
+ ## Citation
63
+
64
+ If you find this repository useful, please consider giving a star ⭐ and citation 🦖:
65
+
66
+ ```
67
+ @misc{xiaoke2023SCA,
68
+ title={{Segment and Caption Anything}},
69
+ author={Xiaoke, Huang and Jianfeng, Wang and Yansong, Tang and Zheng, Zhang and Han, Hu and Jiwen, Lu and Lijuan, Wang and Zicheng, Liu},
70
+ journal={arXiv},
71
+ volume={abs/2312.00869},
72
+ year={2023},
73
+ }
74
+ ```
amlt_configs/accelerate_config.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://huggingface.co/docs/transformers/v4.32.1/en/main_classes/trainer#using-accelerate-launcher-with-trainer
2
+ compute_environment: LOCAL_MACHINE
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ gpu_ids: all
6
+ machine_rank: 0 # change this for each node
7
+ main_process_ip: node-0 # the machines on Azure are inter-connected, so you can directly configure it according to `~/.ssh/config`
8
+ main_process_port: 11451 # change this as you like
9
+ main_training_function: main
10
+ mixed_precision: fp16
11
+ num_machines: 2 # change this for all nodes
12
+ num_processes: 8 # change this for all nodes. all the gpu processes among the nodes.
13
+ rdzv_backend: static
14
+ same_network: true
15
+ tpu_env: []
16
+ tpu_use_cluster: false
17
+ tpu_use_sudo: false
18
+ use_cpu: false
amlt_configs/accelerate_deepspeed_config.local.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://huggingface.co/docs/transformers/v4.32.1/en/main_classes/trainer#using-accelerate-launcher-with-trainer
2
+ compute_environment: LOCAL_MACHINE
3
+ deepspeed_config:
4
+ deepspeed_multinode_launcher: standard
5
+ gradient_accumulation_steps: 1
6
+ offload_optimizer_device: none
7
+ offload_param_device: none
8
+ zero3_init_flag: false
9
+ zero_stage: 2
10
+ gradient_clipping: 1.0
11
+ distributed_type: DEEPSPEED
12
+ downcast_bf16: 'no'
13
+ gpu_ids: all
14
+ machine_rank: 0 # change this for each node
15
+ main_process_ip: localhost # the machines on Azure are inter-connected, so you can directly configure it according to `~/.ssh/config`
16
+ main_process_port: 11451 # change this as you like
17
+ main_training_function: main
18
+ mixed_precision: fp16
19
+ num_machines: 1 # change this for all nodes
20
+ num_processes: 1 # change this for all nodes. all the gpu processes among the nodes.
21
+ rdzv_backend: static
22
+ same_network: true
23
+ tpu_env: []
24
+ tpu_use_cluster: false
25
+ tpu_use_sudo: false
26
+ use_cpu: false
amlt_configs/accelerate_deepspeed_config.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://huggingface.co/docs/transformers/v4.32.1/en/main_classes/trainer#using-accelerate-launcher-with-trainer
2
+ compute_environment: LOCAL_MACHINE
3
+ deepspeed_config:
4
+ deepspeed_multinode_launcher: standard
5
+ gradient_accumulation_steps: 1
6
+ offload_optimizer_device: none
7
+ offload_param_device: none
8
+ zero3_init_flag: false
9
+ zero_stage: 2
10
+ gradient_clipping: 1.0
11
+ distributed_type: DEEPSPEED
12
+ downcast_bf16: 'no'
13
+ gpu_ids: all
14
+ machine_rank: 0 # change this for each node
15
+ main_process_ip: node-0 # the machines on Azure are inter-connected, so you can directly configure it according to `~/.ssh/config`
16
+ main_process_port: 11451 # change this as you like
17
+ main_training_function: main
18
+ mixed_precision: fp16
19
+ num_machines: 2 # change this for all nodes
20
+ num_processes: 8 # change this for all nodes. all the gpu processes among the nodes.
21
+ rdzv_backend: static
22
+ same_network: true
23
+ tpu_env: []
24
+ tpu_use_cluster: false
25
+ tpu_use_sudo: false
26
+ use_cpu: false
amlt_configs/debug-data_mount.yaml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: '
4
+ -m src.train
5
+ train_data=[vg-densecap-region_descriptions] eval_data=[vg-densecap-region_descriptions]
6
+ +model=base_sam_captioner
7
+ training.do_train=False
8
+ training.do_eval=False
9
+ training.do_inference=True
10
+ training.num_masks_per_sample=1
11
+ +data.streaming=False
12
+ training.max_eval_samples=10
13
+ training.max_train_samples=1
14
+ training.num_train_epochs=10
15
+ training.fp16=True
16
+ training.output_dir=$AMLT_OUTPUT_DIR
17
+ training.output_log_dir=$AMLT_LOGS_DIR
18
+ model.cache_dir=/mnt/blob/weights/.model.cache/
19
+ training.dataloader_num_workers=4
20
+ '
21
+
22
+
23
+ environment:
24
+
25
+ image: nvidia/pytorch:23.07-py3
26
+ registry: nvcr.io
27
+
28
+ code:
29
+ local_dir: $CONFIG_DIR/../
30
+
31
+
32
+
33
+ jobs:
34
+ - name: sam_captioner-infer-debug
35
+ sku: G$NUM_GPUS
36
+ preemptible: False
37
+ process_count_per_node: 1 # Each node should run 1 process
38
+ command:
39
+ - . amlt_configs/setup.sh
40
+ - source ~/.bashrc
41
+ - . amlt_configs/setup_accelerate_on_azure.sh
42
+ - . amlt_configs/post_process.sh
43
+ - accelerate launch --config_file amlt_configs/accelerate_config.yaml $SHARED_CMD_ARGS || . amlt_configs/post_process.sh
44
+ - . amlt_configs/post_process.sh
45
+
46
+
47
+ submit_args:
48
+ env:
49
+ AZFUSE_USE_FUSE: "1"
50
+ SHARED_MEMORY_PERCENT: 0.5
51
+ container_args:
52
+ shm_size: 256g
amlt_configs/debug-sca.yaml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: >
4
+ -m src.train
5
+ train_data='[vg-densecap-region_descriptions]' eval_data='[vg-densecap-region_descriptions]'
6
+ +model=base_sca
7
+ training.do_train=True
8
+ training.do_eval=True
9
+ training.do_inference=True
10
+ +data.streaming=False
11
+ training.max_eval_samples=800
12
+ training.max_steps=200000
13
+ training.fp16=True
14
+ model.cache_dir=/mnt/blob/weights/.model.cache/
15
+ training.save_strategy=steps
16
+ training.save_steps=5000
17
+ training.save_total_limit=3
18
+ training.optim=adamw_torch
19
+ training.evaluate_before_train=True
20
+ training.per_device_train_batch_size=1
21
+ training.evaluation_strategy=steps
22
+ training.eval_steps=5000
23
+ training.logging_steps=1000
24
+ training.logging_first_step=True
25
+ training.lr_scheduler_type=constant
26
+ training.warmup_steps=2000
27
+ training.learning_rate=1e-4
28
+ model.lm_head_model_name_or_path=gpt2-large
29
+ training.dataloader_num_workers=4
30
+ training.num_masks_per_sample=8
31
+ model.num_caption_tokens=8
32
+ training.output_dir=$AMLT_OUTPUT_DIR
33
+ training.output_log_dir=$AMLT_LOGS_DIR
34
+ wandb.group=$AMLT_EXPERIMENT_NAME-$AMLT_DESCRIPTION
35
+ wandb.name=$AMLT_JOB_NAME
36
+
37
+
38
+ environment:
39
+
40
+ image: nvidia/pytorch:23.07-py3
41
+ registry: nvcr.io
42
+
43
+ code:
44
+ local_dir: $CONFIG_DIR/../
45
+
46
+
47
+
48
+ jobs:
49
+ - name: sca-debug
50
+ sku: G$NUM_GPUS
51
+ process_count_per_node: 1 # Each node should run 1 process
52
+ preemptible: False
53
+ command:
54
+ - . amlt_configs/setup.sh
55
+ - source ~/.bashrc
56
+ - . amlt_configs/setup_accelerate_on_azure.sh
57
+ - . amlt_configs/post_process.sh
58
+ # - accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml $SHARED_CMD_ARGS || . amlt_configs/post_process.sh
59
+
60
+ submit_args:
61
+ env:
62
+ AZFUSE_USE_FUSE: "1"
63
+ SHARED_MEMORY_PERCENT: 0.5
64
+ container_args:
65
+ shm_size: 256g
amlt_configs/debug.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: '
4
+ -m src.train
5
+ train_data=[vg-densecap-region_descriptions] eval_data=[vg-densecap-region_descriptions]
6
+ +model=base_sam_captioner
7
+ training.do_train=False
8
+ training.do_eval=False
9
+ training.do_inference=True
10
+ training.num_masks_per_sample=1
11
+ +data.streaming=False
12
+ training.max_eval_samples=10
13
+ training.max_train_samples=1
14
+ training.num_train_epochs=10
15
+ training.fp16=True
16
+ training.output_dir=$AMLT_OUTPUT_DIR
17
+ training.output_log_dir=$AMLT_LOGS_DIR
18
+ model.cache_dir=/mnt/blob/weights/.model.cache/
19
+ training.dataloader_num_workers=4
20
+ '
21
+
22
+
23
+
24
+ environment:
25
+ image: nvidia/pytorch:23.07-py3
26
+ registry: nvcr.io
27
+
28
+ code:
29
+ local_dir: $CONFIG_DIR/../
30
+
31
+
32
+
33
+ jobs:
34
+ - name: sam_captioner-infer-debug
35
+ sku: G$NUM_GPUS
36
+ preemptible: False
37
+ process_count_per_node: 1 # Each node should run 1 process
38
+ command:
39
+ - . amlt_configs/setup.sh
40
+ - source ~/.bashrc
41
+ - . amlt_configs/setup_accelerate_on_azure.sh
42
+ - . amlt_configs/post_process.sh
43
+ # - accelerate launch --config_file amlt_configs/accelerate_config.yaml $SHARED_CMD_ARGS || . amlt_configs/post_process.sh
44
+
45
+
46
+ submit_args:
47
+ env:
48
+ AZFUSE_USE_FUSE: "1"
49
+ SHARED_MEMORY_PERCENT: 0.5
50
+ container_args:
51
+ shm_size: 256g
amlt_configs/infer-sam_captioner-region_chunkify-eval_suite.yaml ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: '
4
+ -m src.train
5
+ train_data=[vg-densecap-region_descriptions] eval_data=[vg-densecap-region_descriptions]
6
+ +model=base_sam_captioner
7
+ training.do_train=False
8
+ training.do_eval=False
9
+ training.do_inference=True
10
+ +data.streaming=False
11
+ training.fp16=True
12
+ training.output_dir=$AMLT_OUTPUT_DIR
13
+ training.output_log_dir=$AMLT_LOGS_DIR
14
+ model.cache_dir=/mnt/blob/weights/.model.cache/
15
+ training.dataloader_num_workers=4
16
+ '
17
+
18
+ environment:
19
+
20
+ image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
21
+ registry: nvcr.io
22
+
23
+ code:
24
+ local_dir: $CONFIG_DIR/../
25
+
26
+
27
+
28
+ # Salesforce/blip-image-captioning-large, Salesforce/blip-image-captioning-base, microsoft/git-large-coco, microsoft/git-large-textcaps, microsoft/git-large, microsoft/git-base-coco, microsoft/git-base-textcaps, microsoft/git-base
29
+ # LM_MODEL='Salesforce/blip-image-captioning-large' && amlt run config.yaml :Salesforce/blip-image-captioning-large=$LM_MODEL --extra-args "model.captioner_model_name_or_path=$LM_MODEL"
30
+ jobs:
31
+ - name: Salesforce/blip-image-captioning-large
32
+ sku: G$NUM_GPUS
33
+ preemptible: False
34
+ command:
35
+ - . amlt_configs/setup.sh
36
+ - source ~/.bashrc
37
+ - pip install pydantic==1.10.8 # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
38
+ - . amlt_configs/setup_eval_suite.sh
39
+ - . amlt_configs/setup_accelerate_on_azure.sh
40
+
41
+ # caption
42
+ - DATASET=vg-densecap-region_descriptions
43
+ - accelerate launch --num_processes $NUM_GPUS $SHARED_CMD_ARGS model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
44
+ - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-vg-densecap-region_descriptions/region_img_annot_caption/visual_genome.py-region_descriptions_v1.2.0-test.region_img.tsv
45
+
46
+ - DATASET=refcoco-google
47
+ - accelerate launch --num_processes $NUM_GPUS $SHARED_CMD_ARGS model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
48
+ - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-refcoco-google/region_img_annot_caption/refcoco.py-refcoco-google-test.region_img.tsv
49
+
50
+ # concept
51
+ - DATASET=coco-instance
52
+ - accelerate launch --num_processes $NUM_GPUS $SHARED_CMD_ARGS model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
53
+ - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
54
+
55
+ # OOM and every slow
56
+ # - DATASET=objects365-local
57
+ # - accelerate launch --num_processes $NUM_GPUS $SHARED_CMD_ARGS model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
58
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
59
+
60
+ # OOM and every slow
61
+ # - DATASET=v3det-local
62
+ # - accelerate launch --num_processes $NUM_GPUS $SHARED_CMD_ARGS model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
63
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
64
+
65
+ submit_args:
66
+ env:
67
+ SHARED_MEMORY_PERCENT: 0.5
68
+ container_args:
69
+ shm_size: 256g
amlt_configs/infer-sca-eval_suite-ckpt.yaml ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: >-
4
+ -m src.train
5
+ +model=base_sca
6
+ training.do_train=False
7
+ training.do_eval=False
8
+ training.do_inference=True
9
+ training.fp16=True
10
+ wandb.log=False
11
+ training.dataloader_num_workers=4
12
+ training.output_log_dir=$AMLT_LOGS_DIR
13
+ model.cache_dir=/mnt/blob/weights/.model.cache/
14
+
15
+
16
+
17
+
18
+
19
+ environment:
20
+
21
+ image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
22
+ registry: nvcr.io
23
+
24
+ code:
25
+ local_dir: $CONFIG_DIR/../
26
+
27
+
28
+
29
+ jobs:
30
+ - name: infer-eval_suite
31
+ sku: G$NUM_GPUS
32
+ preemptible: False
33
+ command:
34
+ - . amlt_configs/setup.sh
35
+ - source ~/.bashrc
36
+ - pip install pydantic==1.10.8 # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
37
+ - . amlt_configs/setup_eval_suite.sh
38
+ - . amlt_configs/setup_accelerate_on_azure.sh
39
+
40
+ # caption
41
+ - DATASET=vg-densecap-local
42
+ - >-
43
+ accelerate launch $SHARED_CMD_ARGS
44
+ train_data=[$$DATASET]
45
+ eval_data=[$$DATASET]
46
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
47
+ $EXTRA_ARGS
48
+ - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
49
+
50
+ - DATASET=refcocog-google
51
+ - >-
52
+ accelerate launch $SHARED_CMD_ARGS
53
+ train_data=[$$DATASET]
54
+ eval_data=[$$DATASET]
55
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
56
+ $EXTRA_ARGS
57
+ - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
58
+
59
+ - DATASET=refcoco-unc-split_testA
60
+ - >-
61
+ accelerate launch $SHARED_CMD_ARGS
62
+ train_data=[$$DATASET]
63
+ eval_data=[$$DATASET]
64
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
65
+ $EXTRA_ARGS
66
+ - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
67
+
68
+ - DATASET=refcoco-unc-split_testB
69
+ - >-
70
+ accelerate launch $SHARED_CMD_ARGS
71
+ train_data=[$$DATASET]
72
+ eval_data=[$$DATASET]
73
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
74
+ $EXTRA_ARGS
75
+ - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
76
+
77
+ - DATASET=refcoco+-unc-split_testA
78
+ - >-
79
+ accelerate launch $SHARED_CMD_ARGS
80
+ train_data=[$$DATASET]
81
+ eval_data=[$$DATASET]
82
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
83
+ $EXTRA_ARGS
84
+ - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
85
+
86
+ - DATASET=refcoco+-unc-split_testB
87
+ - >-
88
+ accelerate launch $SHARED_CMD_ARGS
89
+ train_data=[$$DATASET]
90
+ eval_data=[$$DATASET]
91
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
92
+ $EXTRA_ARGS
93
+ - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
94
+
95
+ # concept
96
+ # - DATASET=coco-instance
97
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
98
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
99
+
100
+ # OOM and very slow
101
+ # - DATASET=objects365-local
102
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
103
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
104
+
105
+ # OOM and very slow
106
+ # - DATASET=v3det-local
107
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
108
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
109
+
110
+ submit_args:
111
+ env:
112
+ SHARED_MEMORY_PERCENT: 0.5
113
+ container_args:
114
+ shm_size: 256g
115
+
116
+ # CKPT_PATHS=(
117
+ # /mnt/blob/weights/sca-weights.111823/finetune-gpt2_large-lr_1e_4-1xlr-lsj-bs_1-pretrain_1e_4_no_lsj_bs_32.111223.rr1-4x8-v100-32g-pre/checkpoint-100000
118
+ # /mnt/blob/weights/sca-weights.111823/gpt2-large-lsj-1xlr.110423.octo-4x8-v100-16g-no_pre/checkpoint-200000
119
+ # /mnt/blob/weights/sca-weights.111823/ollm3bv2-large-lsj-1xlr.110423.octo-4x8-v100-16g-no_pre/checkpoint-200000
120
+ # /mnt/blob/weights/sca-weights.111823/pretrain_1e_4_no_lsj_bs_32.110523.rr1-4x8-v100-32g-pre/checkpoint-100000
121
+ # )
122
+ # for CKPT_PATH in ${CKPT_PATHS[@]} ; do
123
+ # CKPT_NAME=$(basename $(dirname $CKPT_PATH))
124
+ # echo $CKPT_NAME
125
+ # amlt run \
126
+ # -d "" --extra-args "training.generation_num_beams=3 training.fp16_full_eval=True model.model_name_or_path=$CKPT_PATH model.lm_head_model_name_or_path=\$(python scripts/tools/get_sub_model_name_from_ckpt.py $CKPT_PATH lm) model.sam_model_name_or_path=facebook/sam-vit-huge" \
127
+ # -t msroctovc -w msroctows --no-pre \
128
+ # --sku G4-V100 \
129
+ # amlt_configs/infer-sca-eval_suite-ckpt.yaml \
130
+ # :0=$CKPT_NAME \
131
+ # `date +"%m%d%y"`.infer-ckpt-all_dataset \
132
+ # -y
133
+ # done
amlt_configs/infer-sca-eval_suite-coco_instance_task_type_caption-last_model.yaml ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: >-
4
+ -m src.train
5
+ +model=base_sca
6
+ training.do_train=False
7
+ training.do_eval=False
8
+ training.do_inference=True
9
+ training.fp16=True
10
+ training.output_log_dir=$AMLT_LOGS_DIR
11
+ model.cache_dir=/mnt/blob/weights/.model.cache/
12
+ wandb.log=False
13
+ training.dataloader_num_workers=4
14
+
15
+
16
+
17
+
18
+ environment:
19
+
20
+ image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
21
+ registry: nvcr.io
22
+
23
+ code:
24
+ local_dir: $CONFIG_DIR/../
25
+
26
+
27
+
28
+ jobs:
29
+ - name: infer-eval_suite
30
+ sku: G$NUM_GPUS
31
+ preemptible: False
32
+ command:
33
+ - . amlt_configs/setup.sh
34
+ - source ~/.bashrc
35
+ - pip install pydantic==1.10.8 # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
36
+ - . amlt_configs/setup_eval_suite.sh
37
+ - . amlt_configs/setup_accelerate_on_azure.sh
38
+
39
+ # get best (or max step) model
40
+ - BEST_CKPT_STEP=$$(python scripts/tools/get_model_name_from_trainer_state.py $$AMLT_MAP_INPUT_DIR "last")
41
+ - BEST_CKPT_PATH=$$(find $$AMLT_MAP_INPUT_DIR -name '*checkpoint*' | grep $$BEST_CKPT_STEP | tail -n1)
42
+
43
+ # caption
44
+ - DATASET=coco-instance-task_type_caption-local
45
+ - >-
46
+ accelerate launch $SHARED_CMD_ARGS
47
+ train_data=[$$DATASET]
48
+ eval_data=[$$DATASET]
49
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
50
+ model.model_name_or_path=$$BEST_CKPT_PATH
51
+ model.lm_head_model_name_or_path=$$(python scripts/tools/get_sub_model_name_from_ckpt.py $$BEST_CKPT_PATH "lm")
52
+ model.sam_model_name_or_path=facebook/sam-vit-huge
53
+ $EXTRA_ARGS
54
+ - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-coco-instance-local/region_img_annot_caption/coco_instance-local.py-2017-validation.region_img.tsv
55
+
56
+ # - DATASET=refcocog-google
57
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
58
+ # - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-refcocog-google/region_img_annot_caption/refcoco.py-refcocog-google-validation.region_img.tsv
59
+
60
+ # - DATASET=refcoco-unc-split_testA
61
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
62
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
63
+
64
+ # - DATASET=refcoco-unc-split_testB
65
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
66
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
67
+
68
+ # - DATASET=refcoco+-unc-split_testA
69
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
70
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
71
+
72
+ # - DATASET=refcoco+-unc-split_testB
73
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
74
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
75
+
76
+ # concept
77
+ # - DATASET=coco-instance
78
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
79
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
80
+
81
+ # OOM and very slow
82
+ # - DATASET=objects365-local
83
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
84
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
85
+
86
+ # OOM and very slow
87
+ # - DATASET=v3det-local
88
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
89
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
90
+
91
+ submit_args:
92
+ env:
93
+ SHARED_MEMORY_PERCENT: 0.5
94
+ container_args:
95
+ shm_size: 256g
amlt_configs/infer-sca-eval_suite-vg-best_model.yaml ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: >-
4
+ -m src.train
5
+ +model=base_sca
6
+ training.do_train=False
7
+ training.do_eval=False
8
+ training.do_inference=True
9
+ training.fp16=True
10
+ training.output_log_dir=$AMLT_LOGS_DIR
11
+ model.cache_dir=/mnt/blob/weights/.model.cache/
12
+ wandb.log=False
13
+ training.dataloader_num_workers=4
14
+
15
+
16
+
17
+
18
+ environment:
19
+
20
+ image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
21
+ registry: nvcr.io
22
+
23
+ code:
24
+ local_dir: $CONFIG_DIR/../
25
+
26
+
27
+
28
+ jobs:
29
+ - name: infer-eval_suite
30
+ sku: G$NUM_GPUS
31
+ preemptible: False
32
+ command:
33
+ - . amlt_configs/setup.sh
34
+ - source ~/.bashrc
35
+ - pip install pydantic==1.10.8 # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
36
+ - . amlt_configs/setup_eval_suite.sh
37
+ - . amlt_configs/setup_accelerate_on_azure.sh
38
+
39
+ # get best (or max step) model
40
+ - BEST_CKPT_STEP=$$(python scripts/tools/get_model_name_from_trainer_state.py $$AMLT_MAP_INPUT_DIR "best")
41
+ - BEST_CKPT_PATH=$$(find $$AMLT_MAP_INPUT_DIR -name '*checkpoint*' | grep $$BEST_CKPT_STEP | tail -n1)
42
+
43
+ # caption
44
+ - DATASET=vg-densecap-region_descriptions
45
+ - >-
46
+ accelerate launch $SHARED_CMD_ARGS
47
+ train_data=[$$DATASET]
48
+ eval_data=[$$DATASET]
49
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
50
+ model.model_name_or_path=$$BEST_CKPT_PATH
51
+ model.lm_head_model_name_or_path=$$(python scripts/tools/get_sub_model_name_from_ckpt.py $$BEST_CKPT_PATH "lm")
52
+ model.sam_model_name_or_path=facebook/sam-vit-huge
53
+ $EXTRA_ARGS
54
+
55
+ - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-vg-densecap-region_descriptions/region_img_annot_caption/visual_genome.py-region_descriptions_v1.2.0-test.region_img.tsv
56
+
57
+ # - DATASET=refcocog-google
58
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
59
+ # - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-refcocog-google/region_img_annot_caption/refcoco.py-refcocog-google-validation.region_img.tsv
60
+
61
+ # - DATASET=refcoco-unc-split_testA
62
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
63
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
64
+
65
+ # - DATASET=refcoco-unc-split_testB
66
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
67
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
68
+
69
+ # - DATASET=refcoco+-unc-split_testA
70
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
71
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
72
+
73
+ # - DATASET=refcoco+-unc-split_testB
74
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
75
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
76
+
77
+ # concept
78
+ # - DATASET=coco-instance
79
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
80
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
81
+
82
+ # OOM and very slow
83
+ # - DATASET=objects365-local
84
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
85
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
86
+
87
+ # OOM and very slow
88
+ # - DATASET=v3det-local
89
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
90
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
91
+
92
+ submit_args:
93
+ env:
94
+ SHARED_MEMORY_PERCENT: 0.5
95
+ container_args:
96
+ shm_size: 256g
amlt_configs/infer-sca-eval_suite-vg-last_model.yaml ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: >-
4
+ -m src.train
5
+ +model=base_sca
6
+ training.do_train=False
7
+ training.do_eval=False
8
+ training.do_inference=True
9
+ training.fp16=True
10
+ training.output_log_dir=$AMLT_LOGS_DIR
11
+ model.cache_dir=/mnt/blob/weights/.model.cache/
12
+ wandb.log=False
13
+ training.dataloader_num_workers=4
14
+
15
+
16
+
17
+
18
+ environment:
19
+
20
+ image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
21
+ registry: nvcr.io
22
+
23
+ code:
24
+ local_dir: $CONFIG_DIR/../
25
+
26
+
27
+
28
+ jobs:
29
+ - name: infer-eval_suite
30
+ sku: G$NUM_GPUS
31
+ preemptible: False
32
+ command:
33
+ - . amlt_configs/setup.sh
34
+ - source ~/.bashrc
35
+ - pip install pydantic==1.10.8 # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
36
+ - . amlt_configs/setup_eval_suite.sh
37
+ - . amlt_configs/setup_accelerate_on_azure.sh
38
+
39
+ # get best (or max step) model
40
+ - BEST_CKPT_STEP=$$(python scripts/tools/get_model_name_from_trainer_state.py $$AMLT_MAP_INPUT_DIR "last")
41
+ - BEST_CKPT_PATH=$$(find $$AMLT_MAP_INPUT_DIR -name '*checkpoint*' | grep $$BEST_CKPT_STEP | tail -n1)
42
+
43
+ # caption
44
+ - DATASET=vg-densecap-region_descriptions
45
+ - >-
46
+ accelerate launch $SHARED_CMD_ARGS
47
+ train_data=[$$DATASET]
48
+ eval_data=[$$DATASET]
49
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
50
+ model.model_name_or_path=$$BEST_CKPT_PATH
51
+ model.lm_head_model_name_or_path=$$(python scripts/tools/get_sub_model_name_from_ckpt.py $$BEST_CKPT_PATH "lm")
52
+ model.sam_model_name_or_path=facebook/sam-vit-huge
53
+ $EXTRA_ARGS
54
+
55
+ - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-vg-densecap-region_descriptions/region_img_annot_caption/visual_genome.py-region_descriptions_v1.2.0-test.region_img.tsv
56
+
57
+ # - DATASET=refcocog-google
58
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
59
+ # - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-refcocog-google/region_img_annot_caption/refcoco.py-refcocog-google-validation.region_img.tsv
60
+
61
+ # - DATASET=refcoco-unc-split_testA
62
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
63
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
64
+
65
+ # - DATASET=refcoco-unc-split_testB
66
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
67
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
68
+
69
+ # - DATASET=refcoco+-unc-split_testA
70
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
71
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
72
+
73
+ # - DATASET=refcoco+-unc-split_testB
74
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
75
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
76
+
77
+ # concept
78
+ # - DATASET=coco-instance
79
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
80
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
81
+
82
+ # OOM and very slow
83
+ # - DATASET=objects365-local
84
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
85
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
86
+
87
+ # OOM and very slow
88
+ # - DATASET=v3det-local
89
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
90
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
91
+
92
+ submit_args:
93
+ env:
94
+ SHARED_MEMORY_PERCENT: 0.5
95
+ container_args:
96
+ shm_size: 256g
amlt_configs/post_process.sh ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ echo "The main process failed, enter post_process.sh"
2
+ python -c "import time;time.sleep(100000)"
amlt_configs/setup.sh ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Uninstall mlflow
4
+ # pip uninstall -y mlflow-skinny
5
+ # pip uninstall -y azureml-mlflow
6
+
7
+ nvidia-smi
8
+
9
+ # Download azcopy
10
+ TMP_DIR=tmp/
11
+ AZCOPY_URL=https://aka.ms/downloadazcopy-v10-linux
12
+ AZCOPY_TAR_FILE="$TMP_DIR/azcopy-v10-linux.tar.gz"
13
+ AZCOPY_FILE="$TMP_DIR/azcopy"
14
+
15
+ "$AZCOPY_FILE" --version
16
+ has_azcopy=$?
17
+
18
+ if [[ has_azcopy -eq 0 ]]; then
19
+ echo "azcopy exists"
20
+ else
21
+ echo "azcopy does not exist"
22
+ mkdir -p $TMP_DIR
23
+ wget $AZCOPY_URL -O $AZCOPY_TAR_FILE
24
+ file_to_be_extracted="$(tar -tvf $AZCOPY_TAR_FILE | grep -E 'azcopy$' | awk '{print $6}')"
25
+ tar -zxvf $AZCOPY_TAR_FILE -C "$TMP_DIR" "$file_to_be_extracted"
26
+ mv $TMP_DIR/$file_to_be_extracted $TMP_DIR
27
+ rm $AZCOPY_TAR_FILE
28
+ rmdir "$(dirname $TMP_DIR/$file_to_be_extracted)"
29
+ chmod 777 $AZCOPY_FILE
30
+ export PATH=$PATH:$(pwd)/$TMP_DIR
31
+ echo "export PATH=\$PATH:$(pwd)/$TMP_DIR" >> ~/.bashrc
32
+ fi
33
+
34
+ # Install pip requirements
35
+ pip install -r requirements.txt
36
+ echo "export PATH=\$PATH:\$HOME/.local/bin" >> ~/.bashrc
37
+ export PATH=$PATH:$HOME/.local/bin
38
+
39
+ # Add wandb api
40
+ # ref: https://docs.wandb.ai/guides/track/environment-variables
41
+ MY_WANDB_API_KEY='YOUR_WANDB_API_KEY'
42
+ export WANDB_API_KEY=$MY_WANDB_API_KEY
43
+ echo "export WANDB_API_KEY=$MY_WANDB_API_KEY" >> ~/.bashrc
44
+
45
+ # Show full error trace from hydra
46
+ echo "export HYDRA_FULL_ERROR=1" >> ~/.bashrc
47
+
48
+ # Change dataset to hg download
49
+ TARGET_DATASETS_VER="2.13.1"
50
+ version="$(pip show datasets | grep Version | awk '{print $2}')"
51
+ if [[ $version == $TARGET_DATASETS_VER ]]; then
52
+ echo "datasets version is $TARGET_DATASETS_VER, changing it to use azcopy..."
53
+ pip_package_path="$(pip show datasets | grep Location | awk '{print $2}')"
54
+ download_file_path="$pip_package_path/datasets/utils/file_utils.py"
55
+ if [[ -f $download_file_path.bak ]]; then
56
+ cp $download_file_path.bak $download_file_path
57
+ fi
58
+ cp $download_file_path $download_file_path.bak
59
+ sed -i '609 i\
60
+ # NOTE(xiaoke): An intrusion to use azcopy to download from Azure blob storage\
61
+ elif "blob.core.windows.net" in url:\
62
+ process_id = -1\
63
+ try:\
64
+ import torch\
65
+ if torch.distributed.is_initialized():\
66
+ process_id = torch.distributed.get_rank()\
67
+ except ImportError:\
68
+ logger.warning("no torch found, cannot determine whether is in ddp mode")\
69
+ except RuntimeError:\
70
+ logger.warning("torch.distributed is not initialized, cannot determine whether is in ddp mode")\
71
+ \
72
+ logger.warning(f"[process {process_id}] Try to use azcopy to download from Azure blob storage")\
73
+ import subprocess\
74
+ \
75
+ has_azcopy = subprocess.run(["azcopy"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).returncode\
76
+ if has_azcopy != 0:\
77
+ logger.warning(f"[process {process_id}] azcopy not found, using http_get, which is slow")\
78
+ http_get(\
79
+ url,\
80
+ temp_file,\
81
+ proxies=proxies,\
82
+ resume_size=resume_size,\
83
+ headers=headers,\
84
+ cookies=cookies,\
85
+ max_retries=max_retries,\
86
+ desc=download_desc,\
87
+ )\
88
+ else:\
89
+ logger.warning(f"[process {process_id}] azcopy found, using azcopy")\
90
+ result = subprocess.run(\
91
+ ["azcopy", "cp", url, temp_file.name],\
92
+ )\
93
+ if result.returncode != 0:\
94
+ raise ConnectionError(\
95
+ f"azcopy failed with return code {result.returncode}"\
96
+ )\
97
+ ' $download_file_path
98
+ else
99
+ echo "datasets version is NOT $TARGET_DATASETS_VER, not changed"
100
+ fi
101
+
102
+ # For debug
103
+ sudo apt-get update
104
+ if [[ $? -ne 0 ]]; then
105
+ apt-get update
106
+ fi
107
+ sudo apt-get install -y tmux htop vim lsof
108
+ if [[ $? -ne 0 ]]; then
109
+ apt-get install -y tmux htop vim lsof
110
+ fi
111
+
112
+ # Tmux config
113
+ curl -L https://raw.githubusercontent.com/hamvocke/dotfiles/master/tmux/.tmux.conf -o - >> ~/.tmux.conf
114
+
115
+ # Vim config
116
+ # Install vim-plug
117
+ curl -fLo ~/.vim/autoload/plug.vim --create-dirs \
118
+ https://raw.githubusercontent.com/junegunn/vim-plug/master/plug.vim
119
+
120
+ cat << EOF > ~/.vimrc
121
+ set tabstop=4
122
+ set shiftwidth=4
123
+ set expandtab
124
+ set smartindent
125
+ set nu
126
+ set hlsearch
127
+ set ignorecase
128
+ set mouse=a
129
+
130
+ call plug#begin()
131
+ Plug 'tpope/vim-surround'
132
+ Plug 'tpope/vim-commentary'
133
+ Plug 'davidhalter/jedi-vim'
134
+ call plug#end()
135
+
136
+ let g:jedi#force_py_version = 3 " Force using Python 3
137
+ EOF
138
+ vim +'PlugInstall --sync' +qa
139
+
140
+ # Install gpustat
141
+ pip install gpustat
142
+
143
+ # echo pwd
144
+ echo "pwd: $(pwd)"
amlt_configs/setup_accelerate_on_azure.sh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ source ~/.bashrc
3
+
4
+ ACCELERATE_CONFIG_PATHS=(amlt_configs/accelerate_config.yaml amlt_configs/accelerate_deepspeed_config.yaml)
5
+ if [[ -z "$WORLD_SIZE" ]]; then
6
+ echo "WORLD_SIZE is not set, using 1"
7
+ WORLD_SIZE=1
8
+ fi
9
+ if [[ -z "$NODE_RANK" ]]; then
10
+ echo "NODE_RANK is not set, using 0"
11
+ NODE_RANK=0
12
+ fi
13
+ NUM_GPUS_PER_NODE=$(nvidia-smi -L | wc -l)
14
+ ((NUM_TOTAL_GPUS = WORLD_SIZE * NUM_GPUS_PER_NODE))
15
+
16
+ echo "Setting up accelerate config:"
17
+ echo "ACCELERATE_CONFIG_PATHS: ${ACCELERATE_CONFIG_PATHS[@]}"
18
+ echo "NUM_TOTAL_GPUS: $NUM_TOTAL_GPUS"
19
+ echo "NUM_GPUS_PER_NODE: $NUM_GPUS_PER_NODE"
20
+ echo "WORLD_SIZE: $WORLD_SIZE"
21
+ echo "NODE_RANK: $NODE_RANK"
22
+ echo "MASTER_ADDR: $MASTER_ADDR"
23
+ echo "MASTER_PORT: $MASTER_PORT"
24
+
25
+ function modify_accelerate_config()
26
+ {
27
+ local ACCELERATE_CONFIG_PATH=$1
28
+ if [[ -z "$MASTER_ADDR" ]]; then
29
+ echo "MASTER_ADDR is not set, using localhost"
30
+ sed -i 's/main_process_ip.*//g' $ACCELERATE_CONFIG_PATH
31
+ sed -i 's/main_process_port.*//g' $ACCELERATE_CONFIG_PATH
32
+ else
33
+ sed -i 's/main_process_ip.*/main_process_ip: '"$MASTER_ADDR"'/g' $ACCELERATE_CONFIG_PATH
34
+ sed -i 's/main_process_port.*/main_process_port: '"$MASTER_PORT"'/g' $ACCELERATE_CONFIG_PATH
35
+ fi
36
+
37
+ sed -i 's/num_machines.*/num_machines: '"$WORLD_SIZE"'/g' $ACCELERATE_CONFIG_PATH
38
+ sed -i 's/machine_rank.*/machine_rank: '"$NODE_RANK"'/g' $ACCELERATE_CONFIG_PATH
39
+
40
+ sed -i 's/num_processes.*/num_processes: '"$NUM_TOTAL_GPUS"'/g' $ACCELERATE_CONFIG_PATH
41
+
42
+ accelerate env --config_file $ACCELERATE_CONFIG_PATH
43
+ # accelerate test --config_file $ACCELERATE_CONFIG_PATH # It may cause bug..ValueError: To use a `DataLoader` in `split_batches` mode, the batch size (8) needs to be a round multiple of the number of processes (16).
44
+ }
45
+
46
+ for ACCELERATE_CONFIG_PATH in "${ACCELERATE_CONFIG_PATHS[@]}"; do
47
+ if [[ -f "$ACCELERATE_CONFIG_PATH" ]]; then
48
+ echo "ACCELERATE_CONFIG_PATH: $ACCELERATE_CONFIG_PATH exists, modifying it with env variables."
49
+ modify_accelerate_config $ACCELERATE_CONFIG_PATH
50
+ else
51
+ echo "ACCELERATE_CONFIG_PATH: $ACCELERATE_CONFIG_PATH does not exist"
52
+ fi
53
+ done
amlt_configs/setup_eval_suite.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ source ~/.bashrc
3
+
4
+ ORIGINAL_DIR="$(pwd)"
5
+ git clone --recursive https://github.com/xk-huang/vdtk.git /tmp/vdtk -b dev
6
+ cd /tmp/vdtk
7
+ git submodule update --init --recursive
8
+
9
+ apt-get update
10
+ sudo apt-get update
11
+ apt-get install -y git-lfs gawk
12
+ sudo apt-get install -y git-lfs gawk
13
+
14
+ git lfs install
15
+ git clone https://huggingface.co/xk-huang/vdtk-data
16
+ # git submodule init && git submodule update
17
+
18
+ rsync -avP ./vdtk-data/vdtk .
19
+ rm -rf vdtk-data
20
+
21
+ pip install --upgrade pip
22
+ pip install -e . POT==0.9.0 # POT=0.9.1 will take up all the memory with tf backend
23
+ pip install tensorflow==2.12.1 # Just fix one version of tf
24
+ pip install levenshtein==0.21.1
25
+ pip install openpyxl==3.1.2
26
+
27
+ python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"
28
+ cd "$ORIGINAL_DIR"
amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: this kind of string leaded by > will append a new line to the end of the string
5
+ SHARED_CMD_ARGS: >-
6
+ -m src.train
7
+ +model=base_sca_multitask_v2
8
+ training.do_train=True
9
+ training.do_eval=True
10
+ training.do_inference=True
11
+ +data.streaming=False
12
+ training.max_eval_samples=800
13
+ training.max_steps=100000
14
+ training.fp16=True
15
+ training.output_dir=$AMLT_OUTPUT_DIR
16
+ training.output_log_dir=$AMLT_LOGS_DIR
17
+ model.cache_dir=/mnt/blob/weights/.model.cache/
18
+ training.save_strategy=steps
19
+ training.save_steps=5000
20
+ training.save_total_limit=3
21
+ training.optim=adamw_torch
22
+ training.evaluate_before_train=True
23
+ training.per_device_train_batch_size=1
24
+ training.evaluation_strategy=steps
25
+ training.eval_steps=5000
26
+ training.logging_steps=1000
27
+ training.logging_first_step=True
28
+ training.dataloader_num_workers=4
29
+ training.num_masks_per_sample=16
30
+ wandb.project=$AMLT_EXPERIMENT_NAME
31
+ wandb.name=$AMLT_JOB_NAME
32
+ model.num_caption_tokens=8
33
+ model.additional_num_hidden_layers=12
34
+ model.num_task_tokens=6
35
+ training.lr_scheduler_type=cosine
36
+ model.lm_head_model_name_or_path=gpt2-large
37
+ training.learning_rate=1e-5
38
+ training.weight_decay=1e-4
39
+ training.warmup_steps=200
40
+ training.warmup_ratio=0.33333333
41
+ training.compute_metrics=True
42
+
43
+
44
+
45
+ environment:
46
+
47
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot initialized successfully
48
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot initialized successfully
49
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
50
+ registry: nvcr.io
51
+
52
+ code:
53
+ local_dir: $CONFIG_DIR/../
54
+
55
+
56
+
57
+ jobs:
58
+ - name: gpt2-large
59
+ preemptible: True
60
+ sku: ${NUM_NODES}xG${NUM_GPUS}-V100-IB
61
+ process_count_per_node: 1 # Each node should run 1 process
62
+ command:
63
+ - . amlt_configs/setup.sh
64
+ - source ~/.bashrc
65
+ - . amlt_configs/setup_accelerate_on_azure.sh
66
+ - >-
67
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
68
+ $SHARED_CMD_ARGS
69
+ train_data='[vg-densecap-region_descriptions]'
70
+ eval_data='[vg-densecap-region_descriptions]'
71
+ model.lm_head_model_name_or_path=gpt2-large
72
+ $EXTRA_ARGS
73
+
74
+
75
+ submit_args:
76
+ env:
77
+ SHARED_MEMORY_PERCENT: 0.5
78
+ HYDRA_FULL_ERROR: 1
79
+ # NCCL_IB_DISABLE: 1
80
+ # NCCL_IBEXT_DISABLE: 1
81
+ container_args:
82
+ shm_size: 256g
83
+
84
+ - name: open_llama_3b_v2
85
+ preemptible: True
86
+ sku: ${NUM_NODES}xG${NUM_GPUS}-V100-IB
87
+ process_count_per_node: 1 # Each node should run 1 process
88
+ command:
89
+ - . amlt_configs/setup.sh
90
+ - source ~/.bashrc
91
+ - . amlt_configs/setup_accelerate_on_azure.sh
92
+ - >-
93
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
94
+ $SHARED_CMD_ARGS
95
+ train_data='[vg-densecap-region_descriptions]'
96
+ eval_data='[vg-densecap-region_descriptions]'
97
+ model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
98
+ training.gradient_checkpointing=true
99
+ $EXTRA_ARGS
100
+
101
+
102
+ submit_args:
103
+ env:
104
+ SHARED_MEMORY_PERCENT: 0.5
105
+ HYDRA_FULL_ERROR: 1
106
+ # NCCL_IB_DISABLE: 1
107
+ # NCCL_IBEXT_DISABLE: 1
108
+ container_args:
109
+ shm_size: 256g
110
+
111
+
112
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=1e-4" \
113
+ # -t msroctovc -w msroctows --sku=G8-V100 --no-pre \
114
+ # amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_4 train-sca-ablat-finetune-scale_lr-110423
115
+
116
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=1e-5" \
117
+ # -t msroctovc -w msroctows --sku=G8-V100 --no-pre \
118
+ # amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_5 train-sca-ablat-finetune-scale_lr-110423
119
+
120
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-5" \
121
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
122
+ # amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_5-1xlr train-sca-ablat-finetune-scale_lr-110423
123
+
124
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4" \
125
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
126
+ # amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_4-1xlr train-sca-ablat-finetune-scale_lr-110423
amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: a folded string led by > will append a newline to the end of the string (hence >- is used here)
5
+ SHARED_CMD_ARGS: >-
6
+ -m src.train
7
+ +model=base_sca_multitask_v2
8
+ training.do_train=True
9
+ training.do_eval=True
10
+ training.do_inference=True
11
+ +data.streaming=False
12
+ training.max_eval_samples=800
13
+ training.max_steps=200000
14
+ training.fp16=True
15
+ training.output_dir=$AMLT_OUTPUT_DIR
16
+ training.output_log_dir=$AMLT_LOGS_DIR
17
+ model.cache_dir=/mnt/blob/weights/.model.cache/
18
+ training.save_strategy=steps
19
+ training.save_steps=5000
20
+ training.save_total_limit=3
21
+ training.optim=adamw_torch
22
+ training.evaluate_before_train=True
23
+ training.per_device_train_batch_size=1
24
+ training.evaluation_strategy=steps
25
+ training.eval_steps=5000
26
+ training.logging_steps=1000
27
+ training.logging_first_step=True
28
+ training.dataloader_num_workers=4
29
+ training.num_masks_per_sample=16
30
+ wandb.project=$AMLT_EXPERIMENT_NAME
31
+ wandb.name=$AMLT_JOB_NAME
32
+ model.num_caption_tokens=8
33
+ model.additional_num_hidden_layers=12
34
+ model.num_task_tokens=6
35
+ training.lr_scheduler_type=cosine
36
+ model.lm_head_model_name_or_path=gpt2-large
37
+ training.learning_rate=1e-4
38
+ training.weight_decay=1e-4
39
+ training.warmup_steps=200
40
+ training.warmup_ratio=0.33333333
41
+ training.compute_metrics=True
42
+
43
+
44
+
45
+ environment:
46
+
47
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
48
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
49
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
50
+ registry: nvcr.io
51
+
52
+ code:
53
+ local_dir: $CONFIG_DIR/../
54
+
55
+
56
+
57
+ jobs:
58
+ - name: gpt2-large
59
+ preemptible: True
60
+ sku: ${NUM_NODES}xG${NUM_GPUS}
61
+ process_count_per_node: 1 # Each node should run 1 process
62
+ command:
63
+ - . amlt_configs/setup.sh
64
+ - source ~/.bashrc
65
+ - . amlt_configs/setup_accelerate_on_azure.sh
66
+ - >-
67
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
68
+ $SHARED_CMD_ARGS
69
+ train_data='[vg-densecap-local]'
70
+ eval_data='[vg-densecap-local]'
71
+ model.lm_head_model_name_or_path=gpt2-large
72
+ $EXTRA_ARGS
73
+
74
+ submit_args:
75
+ env:
76
+ SHARED_MEMORY_PERCENT: 0.5
77
+ HYDRA_FULL_ERROR: 1
78
+ container_args:
79
+ shm_size: 256g
80
+
81
+ - name: open_llama_3b_v2
82
+ preemptible: True
83
+ sku: ${NUM_NODES}xG${NUM_GPUS}
84
+ process_count_per_node: 1 # Each node should run 1 process
85
+ command:
86
+ - . amlt_configs/setup.sh
87
+ - source ~/.bashrc
88
+ - . amlt_configs/setup_accelerate_on_azure.sh
89
+ - >-
90
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
91
+ $SHARED_CMD_ARGS
92
+ train_data='[vg-densecap-local]'
93
+ eval_data='[vg-densecap-local]'
94
+ model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
95
+ training.gradient_checkpointing=true
96
+ $EXTRA_ARGS
97
+
98
+ submit_args:
99
+ env:
100
+ SHARED_MEMORY_PERCENT: 0.5
101
+ HYDRA_FULL_ERROR: 1
102
+ container_args:
103
+ shm_size: 256g
104
+
105
+
106
+ # sing resrch 1x8 no-pre lsj
107
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0" -t msrresrchvc -w msrresrchws --sku=G8-V100 --no-pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.resrch-1x8-v100-16g-no_pre.ollm3bv2-large-lsj train-sca-ablat-lsj-scale_lr-110423
108
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0" -t msrresrchvc -w msrresrchws --sku=G8-V100 --no-pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-1x8-v100-16g-no_pre.gpt2-large-lsj train-sca-ablat-lsj-scale_lr-110423
109
+
110
+ # sing octo 4x8 no-pre lsj
111
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr train-sca-ablat-lsj-scale_lr-110423
112
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr train-sca-ablat-lsj-scale_lr-110423
113
+
114
+ # The maximum scale lr with BS 64: 8e-4 (too big to achieve better)
115
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=8e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.resrch-16x4-v100-16g-pre.ollm3bv2-large-lsj-1xlr train-sca-ablat-lsj-scale_lr-110423
116
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=8e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-16x4-v100-16g-no_pre.gpt2-large-lsj-1xlr train-sca-ablat-lsj-scale_lr-110423
117
+
118
+ # The maximum scale lr with BS 64: 4e-4 (try to achieve better with that from BS 32)
119
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.resrch-16x4-v100-16g-pre.ollm3bv2-large-lsj-1xlr-4e_4 train-sca-ablat-lsj-scale_lr-110423
120
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-16x4-v100-16g-no_pre.gpt2-large-lsj-1xlr-4e_4 train-sca-ablat-lsj-scale_lr-110423
121
+
122
+ # 1x8, 4e-4
123
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t itplabrr1cl1 -w resrchvc --sku=G8-V100 --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.rr1-1x8-v100-16g-pre.ollm3bv2-large-lsj-4e_4 train-sca-ablat-lsj-scale_lr-110423
124
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t itplabrr1cl1 -w resrchvc --sku=G8-V100 --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.rr1-1x8-v100-16g-pre.gpt2-large-lsj-4e_4 train-sca-ablat-lsj-scale_lr-110423
125
+
126
+ # The maximum scale lr with BS 64: 4e-4 (try to achieve better with that from BS 32)
127
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.resrch-16x4-v100-16g-pre.ollm3bv2-large-lsj-1xlr-4e_4 train-sca-ablat-lsj-scale_lr-110423
128
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-16x4-v100-16g-no_pre.gpt2-large-lsj-1xlr-4e_4 train-sca-ablat-lsj-scale_lr-110423
amlt_configs/train-sca-ablat-model_arch-103123.yaml ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: a folded string led by > will append a newline to the end of the string (hence >- is used here)
5
+ SHARED_CMD_ARGS: >-
6
+ -m src.train
7
+ training.do_train=True
8
+ training.do_eval=True
9
+ training.do_inference=True
10
+ training.max_eval_samples=800
11
+ training.max_steps=200000
12
+ training.fp16=True
13
+ training.output_dir=$AMLT_OUTPUT_DIR
14
+ training.output_log_dir=$AMLT_LOGS_DIR
15
+ training.save_strategy=steps
16
+ training.save_steps=5000
17
+ training.save_total_limit=3
18
+ training.optim=adamw_torch
19
+ training.evaluate_before_train=True
20
+ training.per_device_train_batch_size=1
21
+ training.evaluation_strategy=steps
22
+ training.eval_steps=5000
23
+ training.logging_steps=1000
24
+ training.logging_first_step=True
25
+ training.dataloader_num_workers=4
26
+ training.num_masks_per_sample=16
27
+ training.lr_scheduler_type=cosine
28
+ training.learning_rate=1e-4
29
+ training.weight_decay=1e-4
30
+ training.warmup_steps=200
31
+ training.warmup_ratio=0.33333333
32
+ training.compute_metrics=True
33
+ wandb.project=$AMLT_EXPERIMENT_NAME
34
+ wandb.name=$AMLT_JOB_NAME
35
+ model.cache_dir=/mnt/blob/weights/.model.cache/
36
+ model.num_task_tokens=6
37
+ model.lm_head_model_name_or_path=gpt2-large
38
+
39
+
40
+
41
+ environment:
42
+
43
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
44
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
45
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
46
+ registry: nvcr.io
47
+
48
+ code:
49
+ local_dir: $CONFIG_DIR/../
50
+
51
+
52
+
53
+ jobs:
54
+ - name: gpt2-large
55
+ preemptible: True
56
+ sku: ${NUM_NODES}xG${NUM_GPUS}
57
+ process_count_per_node: 1 # Each node should run 1 process
58
+ command:
59
+ - . amlt_configs/setup.sh
60
+ - source ~/.bashrc
61
+ - . amlt_configs/setup_accelerate_on_azure.sh
62
+ - >-
63
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
64
+ $SHARED_CMD_ARGS
65
+ train_data='[vg-densecap-local]'
66
+ eval_data='[vg-densecap-local]'
67
+ model.lm_head_model_name_or_path=gpt2-large
68
+ $EXTRA_ARGS
69
+
70
+ submit_args:
71
+ env:
72
+ SHARED_MEMORY_PERCENT: 0.5
73
+ HYDRA_FULL_ERROR: 1
74
+ container_args:
75
+ shm_size: 256g
76
+
77
+ - name: open_llama_3b_v2
78
+ preemptible: True
79
+ sku: ${NUM_NODES}xG${NUM_GPUS}
80
+ process_count_per_node: 1 # Each node should run 1 process
81
+ command:
82
+ - . amlt_configs/setup.sh
83
+ - source ~/.bashrc
84
+ - . amlt_configs/setup_accelerate_on_azure.sh
85
+ - >-
86
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
87
+ $SHARED_CMD_ARGS
88
+ train_data='[vg-densecap-local]'
89
+ eval_data='[vg-densecap-local]'
90
+ model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
91
+ training.gradient_checkpointing=true
92
+ $EXTRA_ARGS
93
+
94
+ submit_args:
95
+ env:
96
+ SHARED_MEMORY_PERCENT: 0.5
97
+ HYDRA_FULL_ERROR: 1
98
+ container_args:
99
+ shm_size: 256g
100
+
101
+
102
+ # sing resrch 1x8
103
+ # amlt run amlt_configs/train-sca-ablat-model_arch-103123.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.gpt2-large-mtv2 train-sca-ablat-model_arch-103123 -d "" --extra-args "+model=base_sca_multitask_v2 model.num_caption_tokens=8 model.additional_num_hidden_layers=12" -t msroctovc -w msroctows --sku=G8-V100 --no-pre
104
+ # amlt run amlt_configs/train-sca-ablat-model_arch-103123.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.gpt2-large-sm train-sca-ablat-model_arch-103123 -d "" --extra-args "+model=base_sca_multitask_split_mixer model.num_caption_tokens=8 model.additional_num_hidden_layers=12" -t msroctovc -w msroctows --sku=G8-V100 --no-pre
105
+ # sing octo 1x8 v100 16g
106
+ # amlt run amlt_configs/train-sca-ablat-model_arch-103123.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.gpt2-large-ddv2 train-sca-ablat-model_arch-103123 -d "" --extra-args "+model=base_sca_direct_decoding_v2 model.additional_num_hidden_layers=12" -t msroctovc -w msroctows --sku=G8-V100 --no-pre
107
+ # amlt run -d "" --extra-args "+model=base_sca_multitask_roi_pool" -t msrresrchvc -w msrresrchws --sku=G8-V100 --no-pre amlt_configs/train-sca-ablat-model_arch-103123.yaml :0=`date +"%m%d%y"`.resrch-1x8-v100-16g-no_pre.gpt2-large-roi_pool train-sca-ablat-model_arch-103123
108
+ # amlt run -d "" --extra-args "+model=base_sca_multitask_roi_pool model.vl_projector_type=mlp" -t msrresrchvc -w msrresrchws --sku=G8-V100 --no-pre amlt_configs/train-sca-ablat-model_arch-103123.yaml :0=`date +"%m%d%y"`.resrch-1x8-v100-16g-no_pre.gpt2-large-roi_pool-mlp train-sca-ablat-model_arch-103123
109
+
110
+ # Caveat:
111
+ # 1. cannot add two "+model"
112
+ # 2. base_sca_direct_decoding_v2 cannot add `num_caption_tokens`
amlt_configs/train-sca-ablat-sam_size-110423.yaml ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: this kind of string leaded by > will append a new line to the end of the string
5
+ SHARED_CMD_ARGS: >-
6
+ -m src.train
7
+ +model=base_sca_multitask_v2
8
+ training.do_train=True
9
+ training.do_eval=True
10
+ training.do_inference=True
11
+ training.max_eval_samples=800
12
+ training.max_steps=200000
13
+ training.fp16=True
14
+ training.output_dir=$AMLT_OUTPUT_DIR
15
+ training.output_log_dir=$AMLT_LOGS_DIR
16
+ training.save_strategy=steps
17
+ training.save_steps=5000
18
+ training.save_total_limit=3
19
+ training.optim=adamw_torch
20
+ training.evaluate_before_train=True
21
+ training.per_device_train_batch_size=1
22
+ training.evaluation_strategy=steps
23
+ training.eval_steps=5000
24
+ training.logging_steps=1000
25
+ training.logging_first_step=True
26
+ training.dataloader_num_workers=4
27
+ training.num_masks_per_sample=16
28
+ training.lr_scheduler_type=cosine
29
+ training.learning_rate=1e-4
30
+ training.weight_decay=1e-4
31
+ training.warmup_steps=200
32
+ training.warmup_ratio=0.33333333
33
+ training.compute_metrics=True
34
+ wandb.project=$AMLT_EXPERIMENT_NAME
35
+ wandb.name=$AMLT_JOB_NAME
36
+ model.cache_dir=/mnt/blob/weights/.model.cache/
37
+ model.additional_num_hidden_layers=12
38
+ model.num_task_tokens=6
39
+ model.lm_head_model_name_or_path=gpt2-large
40
+ model.num_caption_tokens=8
41
+
42
+
43
+
44
+ environment:
45
+
46
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
47
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
48
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
49
+ registry: nvcr.io
50
+
51
+ code:
52
+ local_dir: $CONFIG_DIR/../
53
+
54
+
55
+
56
+ jobs:
57
+ - name: gpt2-large
58
+ preemptible: True
59
+ sku: ${NUM_NODES}xG${NUM_GPUS}
60
+ process_count_per_node: 1 # Each node should run 1 process
61
+ command:
62
+ - . amlt_configs/setup.sh
63
+ - source ~/.bashrc
64
+ - . amlt_configs/setup_accelerate_on_azure.sh
65
+ - >-
66
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
67
+ $SHARED_CMD_ARGS
68
+ train_data='[vg-densecap-local]'
69
+ eval_data='[vg-densecap-local]'
70
+ model.lm_head_model_name_or_path=gpt2-large
71
+ $EXTRA_ARGS
72
+
73
+ submit_args:
74
+ env:
75
+ SHARED_MEMORY_PERCENT: 0.5
76
+ HYDRA_FULL_ERROR: 1
77
+ container_args:
78
+ shm_size: 256g
79
+
80
+ - name: open_llama_3b_v2
81
+ preemptible: True
82
+ sku: ${NUM_NODES}xG${NUM_GPUS}
83
+ process_count_per_node: 1 # Each node should run 1 process
84
+ command:
85
+ - . amlt_configs/setup.sh
86
+ - source ~/.bashrc
87
+ - . amlt_configs/setup_accelerate_on_azure.sh
88
+ - >-
89
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
90
+ $SHARED_CMD_ARGS
91
+ train_data='[vg-densecap-local]'
92
+ eval_data='[vg-densecap-local]'
93
+ model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
94
+ training.gradient_checkpointing=true
95
+ $EXTRA_ARGS
96
+
97
+ submit_args:
98
+ env:
99
+ SHARED_MEMORY_PERCENT: 0.5
100
+ HYDRA_FULL_ERROR: 1
101
+ container_args:
102
+ shm_size: 256g
103
+
104
+
105
+ # sing resrch 1x8
106
+ # amlt run amlt_configs/train-sca-ablat-sam_size-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.sam-vit-huge train-sca-ablat-sam_size-110423 -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-huge" -t msroctovc -w msroctows --sku=G8-V100 --no-pre -y
107
+ # amlt run amlt_configs/train-sca-ablat-sam_size-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.sam-vit-large train-sca-ablat-sam_size-110423 -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-large" -t msroctovc -w msroctows --sku=G8-V100 --no-pre -y
108
+ # amlt run amlt_configs/train-sca-ablat-sam_size-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.sam-vit-base train-sca-ablat-sam_size-110423 -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-base" -t msroctovc -w msroctows --sku=G8-V100 --no-pre -y
amlt_configs/train-sca-ablat-timm.yaml ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: a folded string led by > will append a newline to the end of the string (hence >- is used here)
5
+ # Use base_sca_multitask_v2
6
+ # training.lr_scheduler_type=constant
7
+ SHARED_CMD_ARGS: >-
8
+ -m src.train
9
+ +model=base_sca_timm_multitask_v2
10
+ training.do_train=True
11
+ training.do_eval=True
12
+ training.do_inference=True
13
+ +data.streaming=False
14
+ training.max_eval_samples=800
15
+ training.max_steps=200000
16
+ training.fp16=True
17
+ training.output_dir=$AMLT_OUTPUT_DIR
18
+ training.output_log_dir=$AMLT_LOGS_DIR
19
+ model.cache_dir=/mnt/blob/weights/.model.cache/
20
+ training.save_strategy=steps
21
+ training.save_steps=5000
22
+ training.save_total_limit=3
23
+ training.optim=adamw_torch
24
+ training.evaluate_before_train=True
25
+ training.per_device_train_batch_size=1
26
+ training.evaluation_strategy=steps
27
+ training.eval_steps=5000
28
+ training.logging_steps=1000
29
+ training.logging_first_step=True
30
+ training.dataloader_num_workers=4
31
+ training.num_masks_per_sample=16
32
+ wandb.project=$AMLT_EXPERIMENT_NAME
33
+ wandb.name=$AMLT_JOB_NAME
34
+ model.num_caption_tokens=8
35
+ model.additional_num_hidden_layers=12
36
+ model.num_task_tokens=6
37
+ training.lr_scheduler_type=cosine
38
+ model.lm_head_model_name_or_path=gpt2-large
39
+ training.learning_rate=1e-4
40
+ training.weight_decay=1e-4
41
+ training.warmup_steps=200
42
+ training.warmup_ratio=0.33333333
43
+ training.compute_metrics=True
44
+
45
+
46
+
47
+ environment:
48
+
49
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
50
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
51
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
52
+ registry: nvcr.io
53
+
54
+ code:
55
+ local_dir: $CONFIG_DIR/../
56
+
57
+
58
+
59
+ jobs:
60
+ - name: vit_base_patch32_clip_224.openai
61
+ preemptible: True
62
+ sku: ${NUM_NODES}xG${NUM_GPUS}
63
+ process_count_per_node: 1 # Each node should run 1 process
64
+ command:
65
+ - . amlt_configs/setup.sh
66
+ - source ~/.bashrc
67
+ - . amlt_configs/setup_accelerate_on_azure.sh
68
+ - >-
69
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
70
+ $SHARED_CMD_ARGS
71
+ train_data='[vg-densecap-local]'
72
+ eval_data='[vg-densecap-local]'
73
+ model.sam_model_name_or_path=facebook/sam-vit-base
74
+ model.timm_vision_name=vit_base_patch32_clip_224.openai
75
+ $EXTRA_ARGS
76
+ submit_args:
77
+ env:
78
+ SHARED_MEMORY_PERCENT: 0.5
79
+ HYDRA_FULL_ERROR: 1
80
+ # NCCL_IB_DISABLE: 1
81
+ # NCCL_IBEXT_DISABLE: 1
82
+ container_args:
83
+ shm_size: 256g
84
+
85
+ # TIMM_NAME_LS=(
86
+ # vit_large_patch14_clip_336.openai
87
+ # vit_large_patch14_clip_224.datacompxl
88
+ # eva02_large_patch14_clip_336.merged2b
89
+ # )
90
+ # for TIMM_NAME in ${TIMM_NAME_LS[@]}; do
91
+ # amlt run -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2) model.timm_vision_name=$TIMM_NAME" \
92
+ # -t itplabrr1cl1 -w resrchvc --no-pre \
93
+ # amlt_configs/train-sca-ablat-timm.yaml \
94
+ # :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.$TIMM_NAME \
95
+ # 112323.train-sca-ablat-timm
96
+ # done
97
+
98
+ # TIMM_NAME_LS=(
99
+ # vit_large_patch14_clip_336.openai
100
+ # vit_large_patch14_clip_224.datacompxl
101
+ # eva02_large_patch14_clip_336.merged2b
102
+ # )
103
+ # for TIMM_NAME in ${TIMM_NAME_LS[@]}; do
104
+ # amlt run -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2) model.timm_vision_name=$TIMM_NAME training.trainable_params='[mask_decoder.additional_transformer,mask_decoder.caption_tokens,task_tokens,language_project,neck,mask_decoder.transformer]'" \
105
+ # -t itplabrr1cl1 -w resrchvc --no-pre \
106
+ # amlt_configs/train-sca-ablat-timm.yaml \
107
+ # :0=`date +"%m%d%y"`.rr1-1x8-v100-16g-no_pre.$TIMM_NAME-tune_sam_xformer \
108
+ # 112323.train-sca-ablat-timm
109
+ # done
110
+
111
+ # TIMM_NAME_LS=(
112
+ # vit_large_patch16_224.mae
113
+ # vit_large_patch14_reg4_dinov2.lvd142m
114
+ # )
115
+ # for TIMM_NAME in ${TIMM_NAME_LS[@]}; do
116
+ # amlt run -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2) model.timm_vision_name=$TIMM_NAME training.trainable_params='[mask_decoder.additional_transformer,mask_decoder.caption_tokens,task_tokens,language_project,neck,mask_decoder.transformer]'" \
117
+ # -t itplabrr1cl1 -w resrchvc --no-pre \
118
+ # amlt_configs/train-sca-ablat-timm.yaml \
119
+ # :0=`date +"%m%d%y"`.rr1-1x8-v100-16g-no_pre.$TIMM_NAME-tune_sam_xformer \
120
+ # 112323.train-sca-ablat-timm
121
+ # done
122
+
123
+ # TIMM_NAME_LS=(
124
+ # vit_large_patch14_reg4_dinov2.lvd142m
125
+ # )
126
+ # for TIMM_NAME in ${TIMM_NAME_LS[@]}; do
127
+ # amlt run -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2) model.timm_vision_name=$TIMM_NAME training.trainable_params='[mask_decoder.additional_transformer,mask_decoder.caption_tokens,task_tokens,language_project,neck,mask_decoder.transformer]'" \
128
+ # -t msroctovc -w msroctows --no-pre \
129
+ # amlt_configs/train-sca-ablat-timm.yaml \
130
+ # :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.$TIMM_NAME-tune_sam_xformer \
131
+ # 112323.train-sca-ablat-timm
132
+ # done
133
+
134
+ # TIMM_NAME_LS=(
135
+ # eva02_large_patch14_clip_336.merged2b
136
+ # )
137
+ # for TIMM_NAME in ${TIMM_NAME_LS[@]}; do
138
+ # amlt run -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2) model.timm_vision_name=$TIMM_NAME" \
139
+ # -t msroctovc -w msroctows --no-pre \
140
+ # amlt_configs/train-sca-ablat-timm.yaml \
141
+ # :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.$TIMM_NAME \
142
+ # 112323.train-sca-ablat-timm
143
+ # done
amlt_configs/train-sca-ablat-weak_sup_data.yaml ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: a folded string led by > will append a newline to the end of the string (hence >- is used here)
5
+ # Use base_sca_multitask_v2
6
+ # training.lr_scheduler_type=constant
7
+ SHARED_CMD_ARGS: >-
8
+ -m src.train
9
+ +model=base_sca_multitask_v2
10
+ training.do_train=True
11
+ training.do_eval=True
12
+ training.do_inference=True
13
+ +data.streaming=False
14
+ training.max_eval_samples=800
15
+ training.max_steps=200000
16
+ training.fp16=True
17
+ training.output_dir=$AMLT_OUTPUT_DIR
18
+ training.output_log_dir=$AMLT_LOGS_DIR
19
+ model.cache_dir=/mnt/blob/weights/.model.cache/
20
+ training.save_strategy=steps
21
+ training.save_steps=5000
22
+ training.save_total_limit=3
23
+ training.optim=adamw_torch
24
+ training.evaluate_before_train=True
25
+ training.per_device_train_batch_size=1
26
+ training.evaluation_strategy=steps
27
+ training.eval_steps=5000
28
+ training.logging_steps=1000
29
+ training.logging_first_step=True
30
+ training.dataloader_num_workers=4
31
+ training.num_masks_per_sample=16
32
+ wandb.project=$AMLT_EXPERIMENT_NAME
33
+ wandb.name=$AMLT_JOB_NAME
34
+ model.num_caption_tokens=8
35
+ model.additional_num_hidden_layers=12
36
+ model.num_task_tokens=6
37
+ training.lr_scheduler_type=cosine
38
+ model.lm_head_model_name_or_path=gpt2-large
39
+ training.learning_rate=1e-4
40
+ training.weight_decay=1e-4
41
+ training.warmup_steps=200
42
+ training.warmup_ratio=0.33333333
43
+ training.compute_metrics=True
44
+
45
+
46
+
47
+ environment:
48
+
49
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
50
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
51
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
52
+ registry: nvcr.io
53
+
54
+ code:
55
+ local_dir: $CONFIG_DIR/../
56
+
57
+
58
+
59
+ jobs:
60
+ - name: only-vg
61
+ preemptible: True
62
+ sku: ${NUM_NODES}xG${NUM_GPUS}
63
+ process_count_per_node: 1 # Each node should run 1 process
64
+ command:
65
+ - . amlt_configs/setup.sh
66
+ - source ~/.bashrc
67
+ - . amlt_configs/setup_accelerate_on_azure.sh
68
+ - >-
69
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
70
+ $SHARED_CMD_ARGS
71
+ train_data='[vg-densecap-local]'
72
+ eval_data='[vg-densecap-local]'
73
+ training.max_steps=100000
74
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
75
+ wandb.name=$$AMLT_JOB_NAME-vg
76
+ $EXTRA_ARGS
77
+ submit_args:
78
+ env:
79
+ SHARED_MEMORY_PERCENT: 0.5
80
+ HYDRA_FULL_ERROR: 1
81
+ # NCCL_IB_DISABLE: 1
82
+ # NCCL_IBEXT_DISABLE: 1
83
+ container_args:
84
+ shm_size: 256g
85
+
86
+ - name: first-coco-then-vg
87
+ preemptible: True
88
+ sku: ${NUM_NODES}xG${NUM_GPUS}
89
+ process_count_per_node: 1 # Each node should run 1 process
90
+ command:
91
+ - . amlt_configs/setup.sh
92
+ - source ~/.bashrc
93
+ - . amlt_configs/setup_accelerate_on_azure.sh
94
+ - >-
95
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
96
+ $SHARED_CMD_ARGS
97
+ train_data='[coco-instance-task_type_caption-local]'
98
+ eval_data='[coco-instance-task_type_caption-local]'
99
+ training.max_steps=100000
100
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
101
+ wandb.name=$$AMLT_JOB_NAME-coco
102
+ $EXTRA_ARGS
103
+ - >-
104
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
105
+ $SHARED_CMD_ARGS
106
+ train_data='[vg-densecap-local]'
107
+ eval_data='[vg-densecap-local]'
108
+ training.max_steps=100000
109
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
110
+ wandb.name=$$AMLT_JOB_NAME-vg
111
+ model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
112
+ $EXTRA_ARGS
113
+ submit_args:
114
+ env:
115
+ SHARED_MEMORY_PERCENT: 0.5
116
+ HYDRA_FULL_ERROR: 1
117
+ # NCCL_IB_DISABLE: 1
118
+ # NCCL_IBEXT_DISABLE: 1
119
+ container_args:
120
+ shm_size: 256g
121
+
122
+ - name: first-v3det-task_type_caption-local-then-vg
123
+ preemptible: True
124
+ sku: ${NUM_NODES}xG${NUM_GPUS}
125
+ process_count_per_node: 1 # Each node should run 1 process
126
+ command:
127
+ - . amlt_configs/setup.sh
128
+ - source ~/.bashrc
129
+ - . amlt_configs/setup_accelerate_on_azure.sh
130
+ - >-
131
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
132
+ $SHARED_CMD_ARGS
133
+ $EXTRA_ARGS
134
+ train_data='[v3det-task_type_caption-local]'
135
+ eval_data='[coco-instance-task_type_caption-local]'
136
+ training.max_steps=100000
137
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
138
+ wandb.name=$$AMLT_JOB_NAME-v3det
139
+ - >-
140
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
141
+ $SHARED_CMD_ARGS
142
+ train_data='[vg-densecap-local]'
143
+ eval_data='[vg-densecap-local]'
144
+ training.max_steps=100000
145
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
146
+ wandb.name=$$AMLT_JOB_NAME-vg
147
+ model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
148
+ $EXTRA_ARGS
149
+ submit_args:
150
+ env:
151
+ SHARED_MEMORY_PERCENT: 0.5
152
+ HYDRA_FULL_ERROR: 1
153
+ # NCCL_IB_DISABLE: 1
154
+ # NCCL_IBEXT_DISABLE: 1
155
+ container_args:
156
+ shm_size: 256g
157
+
158
+ - name: first-objects365-then-vg
159
+ preemptible: True
160
+ sku: ${NUM_NODES}xG${NUM_GPUS}
161
+ process_count_per_node: 1 # Each node should run 1 process
162
+ command:
163
+ - . amlt_configs/setup.sh
164
+ - source ~/.bashrc
165
+ - . amlt_configs/setup_accelerate_on_azure.sh
166
+ - >-
167
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
168
+ $SHARED_CMD_ARGS
169
+ train_data='[objects365-task_type_caption-local]'
170
+ eval_data='[coco-instance-task_type_caption-local]'
171
+ training.max_steps=100000
172
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
173
+ wandb.name=$$AMLT_JOB_NAME-objects365
174
+ $EXTRA_ARGS
175
+ - >-
176
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
177
+ $SHARED_CMD_ARGS
178
+ train_data='[vg-densecap-local]'
179
+ eval_data='[vg-densecap-local]'
180
+ training.max_steps=100000
181
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
182
+ wandb.name=$$AMLT_JOB_NAME-vg
183
+ model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
184
+ $EXTRA_ARGS
185
+ submit_args:
186
+ env:
187
+ SHARED_MEMORY_PERCENT: 0.5
188
+ HYDRA_FULL_ERROR: 1
189
+ # NCCL_IB_DISABLE: 1
190
+ # NCCL_IBEXT_DISABLE: 1
191
+ container_args:
192
+ shm_size: 256g
193
+
194
+
195
+ - name: first-coco-v3det-task_type_caption-local-then-vg
196
+ preemptible: True
197
+ sku: ${NUM_NODES}xG${NUM_GPUS}
198
+ process_count_per_node: 1 # Each node should run 1 process
199
+ command:
200
+ - . amlt_configs/setup.sh
201
+ - source ~/.bashrc
202
+ - . amlt_configs/setup_accelerate_on_azure.sh
203
+ - >-
204
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
205
+ $SHARED_CMD_ARGS
206
+ $EXTRA_ARGS
207
+ train_data='[coco-instance-task_type_caption-local,v3det-task_type_caption-local]'
208
+ train_data_interleave_probabilities='[117266,183348]'
209
+ eval_data='[coco-instance-task_type_caption-local]'
210
+ training.max_steps=100000
211
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
212
+ wandb.name=$$AMLT_JOB_NAME-v3det
213
+ - >-
214
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
215
+ $SHARED_CMD_ARGS
216
+ train_data='[vg-densecap-local]'
217
+ eval_data='[vg-densecap-local]'
218
+ training.max_steps=100000
219
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
220
+ wandb.name=$$AMLT_JOB_NAME-vg
221
+ model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
222
+ $EXTRA_ARGS
223
+ submit_args:
224
+ env:
225
+ SHARED_MEMORY_PERCENT: 0.5
226
+ HYDRA_FULL_ERROR: 1
227
+ # NCCL_IB_DISABLE: 1
228
+ # NCCL_IBEXT_DISABLE: 1
229
+ container_args:
230
+ shm_size: 256g
231
+
232
+ - name: first-coco-v3det-objects365-then-vg
233
+ preemptible: True
234
+ sku: ${NUM_NODES}xG${NUM_GPUS}
235
+ process_count_per_node: 1 # Each node should run 1 process
236
+ command:
237
+ - . amlt_configs/setup.sh
238
+ - source ~/.bashrc
239
+ - . amlt_configs/setup_accelerate_on_azure.sh
240
+ - >-
241
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
242
+ $SHARED_CMD_ARGS
243
+ $EXTRA_ARGS
244
+ train_data='[coco-instance-task_type_caption-local,v3det-task_type_caption-local,objects365-task_type_caption-local]'
245
+ train_data_interleave_probabilities='[117266,183348,1742289]'
246
+ eval_data='[coco-instance-task_type_caption-local]'
247
+ training.max_steps=100000
248
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
249
+ wandb.name=$$AMLT_JOB_NAME-v3det
250
+ - >-
251
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
252
+ $SHARED_CMD_ARGS
253
+ train_data='[vg-densecap-local]'
254
+ eval_data='[vg-densecap-local]'
255
+ training.max_steps=100000
256
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
257
+ wandb.name=$$AMLT_JOB_NAME-vg
258
+ model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
259
+ $EXTRA_ARGS
260
+ submit_args:
261
+ env:
262
+ SHARED_MEMORY_PERCENT: 0.5
263
+ HYDRA_FULL_ERROR: 1
264
+ # NCCL_IB_DISABLE: 1
265
+ # NCCL_IBEXT_DISABLE: 1
266
+ container_args:
267
+ shm_size: 256g
268
+
269
+ - name: first-coco-objects365-then-vg
270
+ preemptible: True
271
+ sku: ${NUM_NODES}xG${NUM_GPUS}
272
+ process_count_per_node: 1 # Each node should run 1 process
273
+ command:
274
+ - . amlt_configs/setup.sh
275
+ - source ~/.bashrc
276
+ - . amlt_configs/setup_accelerate_on_azure.sh
277
+ - >-
278
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
279
+ $SHARED_CMD_ARGS
280
+ $EXTRA_ARGS
281
+ train_data='[coco-instance-task_type_caption-local,objects365-task_type_caption-local]'
282
+ train_data_interleave_probabilities='[117266,1742289]'
283
+ eval_data='[coco-instance-task_type_caption-local]'
284
+ training.max_steps=100000
285
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
286
+ wandb.name=$$AMLT_JOB_NAME-v3det
287
+ - >-
288
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
289
+ $SHARED_CMD_ARGS
290
+ train_data='[vg-densecap-local]'
291
+ eval_data='[vg-densecap-local]'
292
+ training.max_steps=100000
293
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
294
+ wandb.name=$$AMLT_JOB_NAME-vg
295
+ model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
296
+ $EXTRA_ARGS
297
+ submit_args:
298
+ env:
299
+ SHARED_MEMORY_PERCENT: 0.5
300
+ HYDRA_FULL_ERROR: 1
301
+ # NCCL_IB_DISABLE: 1
302
+ # NCCL_IBEXT_DISABLE: 1
303
+ container_args:
304
+ shm_size: 256g
305
+
306
+
307
+ # sing clusters, both octo and resrch failed
308
+ # amlt run -d "" \
309
+ # -t msroctovc -w msroctows --no-pre \
310
+ # amlt_configs/train-sca-ablat-weak_sup_data.yaml \
311
+ # 112123.train-sca-ablat-weak_sup_data.octo
312
+
313
+ # sing clusters, both octo and resrch failed
314
+ # amlt run -d "" \
315
+ # -t msrresrchvc -w msrresrchws --no-pre \
316
+ # amlt_configs/train-sca-ablat-weak_sup_data.yaml \
317
+ # 112123.train-sca-ablat-weak_sup_data.resrch
318
+
319
+ # amlt run -d "" \
320
+ # -t itplabrr1cl1 -w resrchvc --no-pre \
321
+ # amlt_configs/train-sca-ablat-weak_sup_data.yaml \
322
+ # 112123.train-sca-ablat-weak_sup_data.rr1
323
+
324
+ # amlt run -d "" \
325
+ # -t msroctovc -w msroctows --no-pre \
326
+ # amlt_configs/train-sca-ablat-weak_sup_data.yaml :first-coco-objects365-then-vg \
327
+ # 112123.train-sca-ablat-weak_sup_data.rr1
amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: this kind of string leaded by > will append a new line to the end of the string
5
+ SHARED_CMD_ARGS: >-
6
+ -m src.train
7
+ +model=base_sca_multitask_v2
8
+ training.do_train=True
9
+ training.do_eval=True
10
+ training.do_inference=True
11
+ +data.streaming=False
12
+ training.max_eval_samples=800
13
+ training.max_steps=200000
14
+ training.fp16=True
15
+ training.output_dir=$AMLT_OUTPUT_DIR
16
+ training.output_log_dir=$AMLT_LOGS_DIR
17
+ model.cache_dir=/mnt/blob/weights/.model.cache/
18
+ training.save_strategy=steps
19
+ training.save_steps=5000
20
+ training.save_total_limit=3
21
+ training.optim=adamw_torch
22
+ training.evaluate_before_train=True
23
+ training.per_device_train_batch_size=1
24
+ training.evaluation_strategy=steps
25
+ training.eval_steps=5000
26
+ training.logging_steps=1000
27
+ training.logging_first_step=True
28
+ training.dataloader_num_workers=4
29
+ training.num_masks_per_sample=16
30
+ wandb.project=$AMLT_EXPERIMENT_NAME
31
+ wandb.name=$AMLT_JOB_NAME
32
+ model.num_caption_tokens=8
33
+ model.additional_num_hidden_layers=12
34
+ model.num_task_tokens=6
35
+ training.lr_scheduler_type=cosine
36
+ model.lm_head_model_name_or_path=gpt2-large
37
+ training.learning_rate=1e-4
38
+ training.weight_decay=1e-4
39
+ training.warmup_steps=200
40
+ training.warmup_ratio=0.33333333
41
+ training.compute_metrics=True
42
+
43
+
44
+
45
+ environment:
46
+
47
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot initialized successfully
48
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot initialized successfully
49
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
50
+ registry: nvcr.io
51
+
52
+ code:
53
+ local_dir: $CONFIG_DIR/../
54
+
55
+
56
+
57
+ jobs:
58
+ - name: gpt2-large
59
+ preemptible: True
60
+ sku: ${NUM_NODES}xG${NUM_GPUS}
61
+ process_count_per_node: 1 # Each node should run 1 process
62
+ command:
63
+ - . amlt_configs/setup.sh
64
+ - source ~/.bashrc
65
+ - . amlt_configs/setup_accelerate_on_azure.sh
66
+ - >-
67
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
68
+ $SHARED_CMD_ARGS
69
+ train_data='[objects365-task_type_caption-local]'
70
+ eval_data='[objects365-task_type_caption-local]'
71
+ model.lm_head_model_name_or_path=gpt2-large
72
+ $EXTRA_ARGS
73
+
74
+ submit_args:
75
+ env:
76
+ SHARED_MEMORY_PERCENT: 0.5
77
+ HYDRA_FULL_ERROR: 1
78
+ # NCCL_IB_DISABLE: 1
79
+ # NCCL_IBEXT_DISABLE: 1
80
+ container_args:
81
+ shm_size: 256g
82
+
83
+ - name: open_llama_3b_v2
84
+ preemptible: True
85
+ sku: ${NUM_NODES}xG${NUM_GPUS}
86
+ process_count_per_node: 1 # Each node should run 1 process
87
+ command:
88
+ - . amlt_configs/setup.sh
89
+ - source ~/.bashrc
90
+ - . amlt_configs/setup_accelerate_on_azure.sh
91
+ - >-
92
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
93
+ $SHARED_CMD_ARGS
94
+ train_data='[objects365-task_type_caption-local]'
95
+ eval_data='[objects365-task_type_caption-local]'
96
+ model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
97
+ training.gradient_checkpointing=true
98
+ $EXTRA_ARGS
99
+
100
+ submit_args:
101
+ env:
102
+ SHARED_MEMORY_PERCENT: 0.5
103
+ HYDRA_FULL_ERROR: 1
104
+ # NCCL_IB_DISABLE: 1
105
+ # NCCL_IBEXT_DISABLE: 1
106
+ container_args:
107
+ shm_size: 256g
108
+
109
+
110
+
111
+
112
+ # sing octo 4x8 no-pre lsj
113
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml :1=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr train-sca-pretrain-o365-lsj-scale_lr-110923
114
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml :0=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr train-sca-pretrain-o365-lsj-scale_lr-110923
115
+
116
+ # amlt run -d "" --extra-args "training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml :1=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.ollm3bv2-large-no_lsj-1xlr train-sca-pretrain-o365-lsj-scale_lr-110923
117
+ # amlt run -d "" --extra-args "training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml :0=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.gpt2-large-no_lsj-1xlr train-sca-pretrain-o365-lsj-scale_lr-110923
118
+
119
+
120
+ # 4x8(x2)
121
+ # amlt run -d "" --extra-args "training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
122
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
123
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
124
+ # :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.ollm3bv2-large-no_lsj-1xlr-bs_2 \
125
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
126
+
127
+ # amlt run -d "" --extra-args "training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
128
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
129
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
130
+ # :0=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.gpt2-large-no_lsj-1xlr-bs_2 \
131
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
132
+
133
+
134
+ # 4x8(x2), 1e-4
135
+ # amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2" \
136
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
137
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
138
+ # :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.ollm3bv2-large-no_lsj-0xlr-bs_64 \
139
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
140
+
141
+ # resume above due to node 4006 failed
142
+ # amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2 training.resume_from_checkpoint=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7299942105.68600-95f56dfa-4b13-45bc-8d03-aad354819319/checkpoint-45000" \
143
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
144
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
145
+ # :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.ollm3bv2-large-no_lsj-0xlr-bs_64.resume \
146
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
147
+
148
+ # amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2 /mnt/blob/projects/sca-xiaoke-v3/amlt-results/7299935921.15305-a115d837-dada-4074-b41d-f66e1b187cc1/checkpoint-60000" \
149
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
150
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
151
+ # :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.ollm3bv2-large-no_lsj-0xlr-bs_64.resume.2 \
152
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
153
+
154
+ # amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2" \
155
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
156
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
157
+ # :0=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.gpt2-large-no_lsj-0xlr-bs_64 \
158
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
159
+
160
+ # amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=1" \
161
+ # -t msroctovc -w msroctows --sku=8xG8-V100 --no-pre \
162
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
163
+ # :0=`date +"%m%d%y"`.octo-8x8-v100-32g-pre.gpt2-large-no_lsj-0xlr-bs_64 \
164
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
165
+
166
+
167
+ # resume above due to node 4006 failed
168
+ # amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2" \
169
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
170
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
171
+ # :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre-no_ib.ollm3bv2-large-no_lsj-0xlr-bs_64.rerun \
172
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
173
+
174
+ # amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2" \
175
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
176
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
177
+ # :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre-_ib.ollm3bv2-large-no_lsj-0xlr-bs_64.rerun \
178
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: this kind of string leaded by > will append a new line to the end of the string
5
+ SHARED_CMD_ARGS: >-
6
+ -m src.train
7
+ +model=base_sca_multitask_v2
8
+ training.do_train=True
9
+ training.do_eval=True
10
+ training.do_inference=True
11
+ +data.streaming=False
12
+ training.max_eval_samples=800
13
+ training.max_steps=100000
14
+ training.fp16=True
15
+ training.output_dir=$AMLT_OUTPUT_DIR
16
+ training.output_log_dir=$AMLT_LOGS_DIR
17
+ model.cache_dir=/mnt/blob/weights/.model.cache/
18
+ training.save_strategy=steps
19
+ training.save_steps=5000
20
+ training.save_total_limit=3
21
+ training.optim=adamw_torch
22
+ training.evaluate_before_train=True
23
+ training.per_device_train_batch_size=1
24
+ training.evaluation_strategy=steps
25
+ training.eval_steps=5000
26
+ training.logging_steps=1000
27
+ training.logging_first_step=True
28
+ training.dataloader_num_workers=4
29
+ training.num_masks_per_sample=16
30
+ wandb.project=$AMLT_EXPERIMENT_NAME
31
+ wandb.name=$AMLT_JOB_NAME
32
+ model.num_caption_tokens=8
33
+ model.additional_num_hidden_layers=12
34
+ model.num_task_tokens=6
35
+ training.lr_scheduler_type=cosine
36
+ model.lm_head_model_name_or_path=gpt2-large
37
+ training.learning_rate=1e-4
38
+ training.weight_decay=1e-4
39
+ training.warmup_steps=200
40
+ training.warmup_ratio=0.33333333
41
+ training.compute_metrics=True
42
+
43
+
44
+
45
+ environment:
46
+
47
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot initialized successfully
48
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot initialized successfully
49
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
50
+ registry: nvcr.io
51
+
52
+ code:
53
+ local_dir: $CONFIG_DIR/../
54
+
55
+
56
+
57
+ jobs:
58
+ - name: gpt2-large
59
+ preemptible: True
60
+ sku: ${NUM_NODES}xG${NUM_GPUS}-V100-IB
61
+ process_count_per_node: 1 # Each node should run 1 process
62
+ command:
63
+ - . amlt_configs/setup.sh
64
+ - source ~/.bashrc
65
+ - . amlt_configs/setup_accelerate_on_azure.sh
66
+ - >-
67
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
68
+ $SHARED_CMD_ARGS
69
+ train_data='[vg-densecap-region_descriptions]'
70
+ eval_data='[vg-densecap-region_descriptions]'
71
+ model.lm_head_model_name_or_path=gpt2-large
72
+ $EXTRA_ARGS
73
+
74
+
75
+ submit_args:
76
+ env:
77
+ SHARED_MEMORY_PERCENT: 0.5
78
+ HYDRA_FULL_ERROR: 1
79
+ # NCCL_IB_DISABLE: 1
80
+ # NCCL_IBEXT_DISABLE: 1
81
+ container_args:
82
+ shm_size: 256g
83
+
84
+ - name: open_llama_3b_v2
85
+ preemptible: True
86
+ sku: ${NUM_NODES}xG${NUM_GPUS}-V100-IB
87
+ process_count_per_node: 1 # Each node should run 1 process
88
+ command:
89
+ - . amlt_configs/setup.sh
90
+ - source ~/.bashrc
91
+ - . amlt_configs/setup_accelerate_on_azure.sh
92
+ - >-
93
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
94
+ $SHARED_CMD_ARGS
95
+ train_data='[vg-densecap-region_descriptions]'
96
+ eval_data='[vg-densecap-region_descriptions]'
97
+ model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
98
+ training.gradient_checkpointing=true
99
+ $EXTRA_ARGS
100
+
101
+
102
+ submit_args:
103
+ env:
104
+ SHARED_MEMORY_PERCENT: 0.5
105
+ HYDRA_FULL_ERROR: 1
106
+ # NCCL_IB_DISABLE: 1
107
+ # NCCL_IBEXT_DISABLE: 1
108
+ container_args:
109
+ shm_size: 256g
110
+
111
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr/checkpoint-100000/ training.max_steps=100000 training.learning_rate=4e-4" \
112
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
113
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
114
+
115
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4" \
116
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
117
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :1=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.fintune-ollmv2-lr_1e_4-1xlr-lsj train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
118
+
119
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr/checkpoint-100000/ training.max_steps=100000 training.learning_rate=4e-4" \
120
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --pre \
121
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.resrch-4x8-v100-16g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
122
+
123
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4" \
124
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --pre \
125
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :1=`date +"%m%d%y"`.resrch-4x8-v100-16g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
126
+
127
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr/checkpoint-100000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
128
+ # -t itphcrdellcl1 --vc hcrdell1 --sku=5xG4-V100 --no-pre \
129
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.dell-5x4-v100-32g-no_pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_2 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
130
+
131
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
132
+ # -t itphcrdellcl1 --vc hcrdell1 --sku=5xG4-V100 --no-pre \
133
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :1=`date +"%m%d%y"`.dell-5x4-v100-32g-no_pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_2 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
134
+
135
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr/checkpoint-100000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
136
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
137
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_2 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
138
+
139
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
140
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
141
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :1=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_2 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
142
+
143
+
144
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
145
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
146
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_2-o365_1e_4_no_lsj_bs_64 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
147
+
148
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
149
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
150
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_2-o365_1e_4_no_lsj_bs_64 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
151
+
152
+
153
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111323.backup_ckpts.pretrain-o365-no_lsj-bs_64/train-sca-pretrain-o365-lsj-scale_lr-110923/111223.rr1-4x8-v100-32g-pre.gpt2-large-no_lsj-1xlr-bs_2/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
154
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
155
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
156
+
157
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111323.backup_ckpts.pretrain-o365-no_lsj-bs_64/train-sca-pretrain-o365-lsj-scale_lr-110923/111223.rr1-4x8-v100-32g-pre.ollm3bv2-large-no_lsj-1xlr-bs_2/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
158
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
159
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :1=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
160
+
161
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228594.52312-e52ba12d-9e32-41d4-9630-e8c5d3e47ca0/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
162
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
163
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
164
+ # :1=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k \
165
+ # train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
166
+
167
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228580.84789-0b1216d8-79dc-46b3-8ef2-57c112e1bd18/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
168
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
169
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
170
+ # :0=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k \
171
+ # train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
172
+
173
+ # The o365 ollm3bv2 failed due to devices. try different clusters
174
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228594.52312-e52ba12d-9e32-41d4-9630-e8c5d3e47ca0/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=1" \
175
+ # -t msrresrchvc -w msrresrchws --sku=8xG4-V100-IB --pre \
176
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
177
+ # :1=`date +"%m%d%y"`.resrch-8x4-v100-16g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k \
178
+ # train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
179
+
180
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228594.52312-e52ba12d-9e32-41d4-9630-e8c5d3e47ca0/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=1" \
181
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
182
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
183
+ # :1=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k \
184
+ # train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
185
+
186
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228594.52312-e52ba12d-9e32-41d4-9630-e8c5d3e47ca0/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=1" \
187
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --pre \
188
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
189
+ # :1=`date +"%m%d%y"`.resrch-4x8-v100-16g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k \
190
+ # train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
191
+
192
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228594.52312-e52ba12d-9e32-41d4-9630-e8c5d3e47ca0/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
193
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
194
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
195
+ # :1=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k.2 \
196
+ # train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
data/demo_cases/1000_IM-0003-1001.dcm.png ADDED

Git LFS Details

  • SHA256: ec5a66237445d21c019acdf3c47360f3b57f982087a5b6cf46b9822ba3ec6a02
  • Pointer size: 131 Bytes
  • Size of remote file: 273 kB
data/demo_cases/1000_IM-0003-2001.dcm.png ADDED

Git LFS Details

  • SHA256: d60af61d43bf3950eb86f526e542db472f75f84d1729412110f3a832c66940d3
  • Pointer size: 131 Bytes
  • Size of remote file: 249 kB
data/demo_cases/1000_IM-0003-3001.dcm.png ADDED

Git LFS Details

  • SHA256: e568e41c38c1a5b4f825c5bc4edc59b1f336659d471479e4aa3810fe61697c70
  • Pointer size: 131 Bytes
  • Size of remote file: 261 kB
data/demo_cases/1001_IM-0004-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 0140a86fe5f25863595d55c188aeb523b080ac2e3156c19c67c769b7e8c2d856
  • Pointer size: 131 Bytes
  • Size of remote file: 267 kB
data/demo_cases/1001_IM-0004-1002.dcm.png ADDED

Git LFS Details

  • SHA256: 2197f2da9ff3e632b5b1278e966305ac78858830464d4622ec873962a2f8d9d3
  • Pointer size: 131 Bytes
  • Size of remote file: 202 kB
data/demo_cases/1002_IM-0004-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 034b9ad401959d441b204ef8f61dde0714094d66b05c48696d161a2f24645791
  • Pointer size: 131 Bytes
  • Size of remote file: 270 kB
data/demo_cases/1002_IM-0004-2001.dcm.png ADDED

Git LFS Details

  • SHA256: 9350a749f581acba7619427a55490bea5deb9fbf6292877a566d7f25e707f67b
  • Pointer size: 131 Bytes
  • Size of remote file: 254 kB
data/demo_cases/1003_IM-0005-2002.dcm.png ADDED

Git LFS Details

  • SHA256: b889b5eb47f8a1d2bdf99ccd7496b889e16878a99d7a37d967eec9d1559036c4
  • Pointer size: 131 Bytes
  • Size of remote file: 299 kB
data/demo_cases/1004_IM-0005-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 3bee7774395b09950e0252cdc6f798c813ed2662fa5d168f863e9a28593a0cf0
  • Pointer size: 131 Bytes
  • Size of remote file: 273 kB
data/demo_cases/1004_IM-0005-2001.dcm.png ADDED

Git LFS Details

  • SHA256: 6048551a2869954e9859327ce55b31fde3f052a9f80a263e06c5b81e0548eff2
  • Pointer size: 131 Bytes
  • Size of remote file: 238 kB
data/demo_cases/1005_IM-0006-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 6c18fcbe986c191bd7d0b89bcc75425446712c34ff1ab6757c4a0ee704261e6c
  • Pointer size: 131 Bytes
  • Size of remote file: 300 kB
data/demo_cases/1005_IM-0006-3003.dcm.png ADDED

Git LFS Details

  • SHA256: 921532343d4a71b62d342075b1f2126f372fd3481d9253133f557a7563f5b197
  • Pointer size: 131 Bytes
  • Size of remote file: 248 kB
data/demo_cases/1006_IM-0007-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 2d3b62233c23b5f2f1e39dce450226c8ea697031242ac3e76c8366738f127eb6
  • Pointer size: 131 Bytes
  • Size of remote file: 293 kB
data/demo_cases/1006_IM-0007-3003.dcm.png ADDED

Git LFS Details

  • SHA256: 9765e0e404dcd241be967842dd14b365304f516da278e977aa4cd2d5f148529a
  • Pointer size: 131 Bytes
  • Size of remote file: 273 kB
data/demo_cases/1007_IM-0008-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 46c42fd74bc706da37e656446eafee4b43ad7bbf033571889da782ca681454db
  • Pointer size: 131 Bytes
  • Size of remote file: 275 kB
data/demo_cases/1007_IM-0008-2001.dcm.png ADDED

Git LFS Details

  • SHA256: c89264ecab2b84c480a28d27e40ddd232fe7fe12942aa5782cd0fd166bd0793e
  • Pointer size: 131 Bytes
  • Size of remote file: 228 kB
data/demo_cases/1007_IM-0008-3001.dcm.png ADDED

Git LFS Details

  • SHA256: 995b6b44632e58676e7d0106647401463eb727eed9398ab525c85f0acc3cf0b3
  • Pointer size: 131 Bytes
  • Size of remote file: 272 kB
data/demo_cases/1008_IM-0009-2001.dcm.png ADDED

Git LFS Details

  • SHA256: 41421b8cf7ffa1d18ffd2ec82e44a776c8eb7336968109c3b2cde6c8756054c5
  • Pointer size: 131 Bytes
  • Size of remote file: 247 kB
data/demo_cases/1008_IM-0009-4004.dcm.png ADDED

Git LFS Details

  • SHA256: e94c68da790a105a555a4a66ef269c10d91901b832ec4542960024fc8a57050b
  • Pointer size: 131 Bytes
  • Size of remote file: 312 kB
data/demo_cases/1009_IM-0010-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 5f4bffcd4ea12a253ac44dad49f115142225949647700a0cd0e230558fcb8688
  • Pointer size: 131 Bytes
  • Size of remote file: 260 kB
data/demo_cases/1009_IM-0010-2001.dcm.png ADDED

Git LFS Details

  • SHA256: 291928749fc79dc00fe5a14cfe11ae23dae34a5e6c018d1b5bcb7d30b325981b
  • Pointer size: 131 Bytes
  • Size of remote file: 214 kB
data/demo_cases/100_IM-0002-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 0e0e3afc6c1ddfdfa68b99e56fd63a9ab738beb2f53165844ea9c7d6cfd9d29d
  • Pointer size: 131 Bytes
  • Size of remote file: 270 kB