xingzhikb commited on
Commit
002bd9b
·
0 Parent(s):
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .amltignore +17 -0
  2. .gitattributes +2 -0
  3. .gitignore +161 -0
  4. LICENSE +201 -0
  5. README.md +74 -0
  6. amlt_configs/accelerate_config.yaml +18 -0
  7. amlt_configs/accelerate_deepspeed_config.local.yaml +26 -0
  8. amlt_configs/accelerate_deepspeed_config.yaml +26 -0
  9. amlt_configs/debug-data_mount.yaml +52 -0
  10. amlt_configs/debug-sca.yaml +65 -0
  11. amlt_configs/debug.yaml +51 -0
  12. amlt_configs/infer-sam_captioner-region_chunkify-eval_suite.yaml +69 -0
  13. amlt_configs/infer-sca-eval_suite-ckpt.yaml +133 -0
  14. amlt_configs/infer-sca-eval_suite-coco_instance_task_type_caption-last_model.yaml +95 -0
  15. amlt_configs/infer-sca-eval_suite-vg-best_model.yaml +96 -0
  16. amlt_configs/infer-sca-eval_suite-vg-last_model.yaml +96 -0
  17. amlt_configs/post_process.sh +2 -0
  18. amlt_configs/setup.sh +144 -0
  19. amlt_configs/setup_accelerate_on_azure.sh +53 -0
  20. amlt_configs/setup_eval_suite.sh +28 -0
  21. amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml +126 -0
  22. amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml +128 -0
  23. amlt_configs/train-sca-ablat-model_arch-103123.yaml +112 -0
  24. amlt_configs/train-sca-ablat-sam_size-110423.yaml +108 -0
  25. amlt_configs/train-sca-ablat-timm.yaml +143 -0
  26. amlt_configs/train-sca-ablat-weak_sup_data.yaml +327 -0
  27. amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml +178 -0
  28. amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml +196 -0
  29. data/demo_cases/1000_IM-0003-1001.dcm.png +3 -0
  30. data/demo_cases/1000_IM-0003-2001.dcm.png +3 -0
  31. data/demo_cases/1000_IM-0003-3001.dcm.png +3 -0
  32. data/demo_cases/1001_IM-0004-1001.dcm.png +3 -0
  33. data/demo_cases/1001_IM-0004-1002.dcm.png +3 -0
  34. data/demo_cases/1002_IM-0004-1001.dcm.png +3 -0
  35. data/demo_cases/1002_IM-0004-2001.dcm.png +3 -0
  36. data/demo_cases/1003_IM-0005-2002.dcm.png +3 -0
  37. data/demo_cases/1004_IM-0005-1001.dcm.png +3 -0
  38. data/demo_cases/1004_IM-0005-2001.dcm.png +3 -0
  39. data/demo_cases/1005_IM-0006-1001.dcm.png +3 -0
  40. data/demo_cases/1005_IM-0006-3003.dcm.png +3 -0
  41. data/demo_cases/1006_IM-0007-1001.dcm.png +3 -0
  42. data/demo_cases/1006_IM-0007-3003.dcm.png +3 -0
  43. data/demo_cases/1007_IM-0008-1001.dcm.png +3 -0
  44. data/demo_cases/1007_IM-0008-2001.dcm.png +3 -0
  45. data/demo_cases/1007_IM-0008-3001.dcm.png +3 -0
  46. data/demo_cases/1008_IM-0009-2001.dcm.png +3 -0
  47. data/demo_cases/1008_IM-0009-4004.dcm.png +3 -0
  48. data/demo_cases/1009_IM-0010-1001.dcm.png +3 -0
  49. data/demo_cases/1009_IM-0010-2001.dcm.png +3 -0
  50. data/demo_cases/100_IM-0002-1001.dcm.png +3 -0
.amltignore ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /exp*
2
+ /tmp
3
+
4
+ /.mypy_cache
5
+ __pycache__/
6
+ /build
7
+ /data
8
+ /results
9
+ *.egg-info/
10
+ scripts/examples/
11
+ .amltconfig
12
+ /amlt
13
+ .*cache/
14
+ wandb/
15
+
16
+ build/
17
+ *.egg-info/
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ data/**/* filter=lfs diff=lfs merge=lfs -text
2
+ Medical-SAM2/data/** filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ wandb/
132
+
133
+ # VSCode
134
+ .vscode/*
135
+ # !.vscode/settings.json
136
+ # !.vscode/tasks.json
137
+ # !.vscode/launch.json
138
+ # !.vscode/extensions.json
139
+
140
+ # Hydra
141
+ .hydra
142
+ multirun.yaml
143
+ .submitit
144
+
145
+ # These should be symlinked.
146
+ exp
147
+ .*cache/
148
+ /tmp
149
+
150
+ # Download data manually.
151
+ data/all_instances_82K.jsonl
152
+ data/alpaca_data.json
153
+ data/user_oriented_instructions.jsonl
154
+
155
+ # Ignore amlt files
156
+ .amltconfig
157
+ /amlt
158
+
159
+ # Ignore slurm files
160
+ **/*slurm*/**
161
+ *.slurm
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Segment and Caption Anything
2
+
3
+ The repository contains the official implementation of "Segment and Caption Anything"
4
+
5
+ [Project Page](https://xk-huang.github.io/segment-caption-anything), [Paper](https://arxiv.org/abs/2312.00869)
6
+
7
+ ![teaser](./docs/teaser-github.svg)
8
+
9
+ tl;dr
10
+ 1. Despite the absence of semantic labels in the training data, SAM implies high-level semantics sufficient for captioning.
11
+ 2. SCA (b) is a lightweight augmentation of SAM (a) with the ability to generate regional captions.
12
+ 3. On top of SAM architecture, we add a fixed pre-trained language model, and an optimizable lightweight hybrid feature mixture whose training is cheap and scalable.
13
+
14
+ <table>
15
+ <tr>
16
+ <td><img src="./docs/anything-mode-00.png.jpg" alt="anything-mode-00"></td>
17
+ <td><img src="./docs/anything-mode-03.png.jpg" alt="anything-mode-01"></td>
18
+ </tr>
19
+ <tr>
20
+ <td><img src="./docs/anything-mode-01.png.jpg" alt="anything-mode-02"></td>
21
+ <td><img src="./docs/anything-mode-02.png.jpg" alt="anything-mode-03"></td>
22
+ </tr>
23
+ </table>
24
+
25
+ News
26
+
27
+ - [01/31/2024] Update the [paper](https://xk-huang.github.io/segment-caption-anything/files/segment-caption-anything.013124.pdf) and the [supp](https://xk-huang.github.io/segment-caption-anything/files/segment-caption-anything-supp.013124.pdf). Release code v0.0.2: bump transformers to 4.36.2, support mistral series, phi-2, zephyr; add experiments about SAM+Image Captioner+[V-CoT](https://github.com/ttengwang/Caption-Anything), and more.
28
+ - [12/05/2023] Release paper, code v0.0.1, and project page!
29
+
30
+ ## Environment Preparation
31
+
32
+ Please check [docs/ENV.md](docs/ENV.md).
33
+
34
+
35
+ ## Model Zoo
36
+
37
+ Please check [docs/MODEL_ZOO.md](docs/MODEL_ZOO.md)
38
+
39
+
40
+ ## Gradio Demo
41
+
42
+ Please check [docs/DEMO.md](docs/DEMO.md)
43
+
44
+
45
+ ## Running Training and Inference
46
+
47
+ Please check [docs/USAGE.md](docs/USAGE.md).
48
+
49
+
50
+ ## Experiments and Evaluation
51
+
52
+ Please check [docs/EVAL.md](docs/EVAL.md)
53
+
54
+ ## License
55
+
56
+ The trained weights are licensed under the [Apache 2.0 license](https://github.com/xk-huang/segment-caption-anything/blob/1c810bfcfeb3b95cd4b1f502f8f30c46333d58b8/LICENSE).
57
+
58
+ ## Acknowledgement
59
+
60
+ Deeply appreciate these wonderful open source projects: [transformers](https://github.com/huggingface/transformers), [accelerate](https://github.com/huggingface/accelerate), [deepspeed](https://github.com/microsoft/DeepSpeed), [detectron2](https://github.com/facebookresearch/detectron2), [hydra](https://github.com/facebookresearch/hydra), [timm](https://github.com/huggingface/pytorch-image-models), [gradio](https://github.com/gradio-app/gradio).
61
+
62
+ ## Citation
63
+
64
+ If you find this repository useful, please consider giving a star ⭐ and citation 🦖:
65
+
66
+ ```
67
+ @misc{xiaoke2023SCA,
68
+ title={{Segment and Caption Anything}},
69
+ author={Xiaoke, Huang and Jianfeng, Wang and Yansong, Tang and Zheng, Zhang and Han, Hu and Jiwen, Lu and Lijuan, Wang and Zicheng, Liu},
70
+ journal={arXiv},
71
+ volume={abs/2312.00869},
72
+ year={2023},
73
+ }
74
+ ```
amlt_configs/accelerate_config.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://huggingface.co/docs/transformers/v4.32.1/en/main_classes/trainer#using-accelerate-launcher-with-trainer
2
+ compute_environment: LOCAL_MACHINE
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ gpu_ids: all
6
+ machine_rank: 0 # change this for each node
7
+ main_process_ip: node-0 # the machines on Azure are inter-connected, so you can directly configure it according to `~/.ssh/config`
8
+ main_process_port: 11451 # change this as you like
9
+ main_training_function: main
10
+ mixed_precision: fp16
11
+ num_machines: 2 # change this for all nodes
12
+ num_processes: 8 # change this for all nodes. all the gpu processes among the nodes.
13
+ rdzv_backend: static
14
+ same_network: true
15
+ tpu_env: []
16
+ tpu_use_cluster: false
17
+ tpu_use_sudo: false
18
+ use_cpu: false
amlt_configs/accelerate_deepspeed_config.local.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://huggingface.co/docs/transformers/v4.32.1/en/main_classes/trainer#using-accelerate-launcher-with-trainer
2
+ compute_environment: LOCAL_MACHINE
3
+ deepspeed_config:
4
+ deepspeed_multinode_launcher: standard
5
+ gradient_accumulation_steps: 1
6
+ offload_optimizer_device: none
7
+ offload_param_device: none
8
+ zero3_init_flag: false
9
+ zero_stage: 2
10
+ gradient_clipping: 1.0
11
+ distributed_type: DEEPSPEED
12
+ downcast_bf16: 'no'
13
+ gpu_ids: all
14
+ machine_rank: 0 # change this for each node
15
+ main_process_ip: localhost # the machines on Azure are inter-connected, so you can directly configure it according to `~/.ssh/config`
16
+ main_process_port: 11451 # change this as you like
17
+ main_training_function: main
18
+ mixed_precision: fp16
19
+ num_machines: 1 # change this for all nodes
20
+ num_processes: 1 # change this for all nodes. all the gpu processes among the nodes.
21
+ rdzv_backend: static
22
+ same_network: true
23
+ tpu_env: []
24
+ tpu_use_cluster: false
25
+ tpu_use_sudo: false
26
+ use_cpu: false
amlt_configs/accelerate_deepspeed_config.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://huggingface.co/docs/transformers/v4.32.1/en/main_classes/trainer#using-accelerate-launcher-with-trainer
2
+ compute_environment: LOCAL_MACHINE
3
+ deepspeed_config:
4
+ deepspeed_multinode_launcher: standard
5
+ gradient_accumulation_steps: 1
6
+ offload_optimizer_device: none
7
+ offload_param_device: none
8
+ zero3_init_flag: false
9
+ zero_stage: 2
10
+ gradient_clipping: 1.0
11
+ distributed_type: DEEPSPEED
12
+ downcast_bf16: 'no'
13
+ gpu_ids: all
14
+ machine_rank: 0 # change this for each node
15
+ main_process_ip: node-0 # the machines on Azure are inter-connected, so you can directly configure it according to `~/.ssh/config`
16
+ main_process_port: 11451 # change this as you like
17
+ main_training_function: main
18
+ mixed_precision: fp16
19
+ num_machines: 2 # change this for all nodes
20
+ num_processes: 8 # change this for all nodes. all the gpu processes among the nodes.
21
+ rdzv_backend: static
22
+ same_network: true
23
+ tpu_env: []
24
+ tpu_use_cluster: false
25
+ tpu_use_sudo: false
26
+ use_cpu: false
amlt_configs/debug-data_mount.yaml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: '
4
+ -m src.train
5
+ train_data=[vg-densecap-region_descriptions] eval_data=[vg-densecap-region_descriptions]
6
+ +model=base_sam_captioner
7
+ training.do_train=False
8
+ training.do_eval=False
9
+ training.do_inference=True
10
+ training.num_masks_per_sample=1
11
+ +data.streaming=False
12
+ training.max_eval_samples=10
13
+ training.max_train_samples=1
14
+ training.num_train_epochs=10
15
+ training.fp16=True
16
+ training.output_dir=$AMLT_OUTPUT_DIR
17
+ training.output_log_dir=$AMLT_LOGS_DIR
18
+ model.cache_dir=/mnt/blob/weights/.model.cache/
19
+ training.dataloader_num_workers=4
20
+ '
21
+
22
+
23
+ environment:
24
+
25
+ image: nvidia/pytorch:23.07-py3
26
+ registry: nvcr.io
27
+
28
+ code:
29
+ local_dir: $CONFIG_DIR/../
30
+
31
+
32
+
33
+ jobs:
34
+ - name: sam_captioner-infer-debug
35
+ sku: G$NUM_GPUS
36
+ preemptible: False
37
+ process_count_per_node: 1 # Each node should run 1 process
38
+ command:
39
+ - . amlt_configs/setup.sh
40
+ - source ~/.bashrc
41
+ - . amlt_configs/setup_accelerate_on_azure.sh
42
+ - . amlt_configs/post_process.sh
43
+ - accelerate launch --config_file amlt_configs/accelerate_config.yaml $SHARED_CMD_ARGS || . amlt_configs/post_process.sh
44
+ - . amlt_configs/post_process.sh
45
+
46
+
47
+ submit_args:
48
+ env:
49
+ AZFUSE_USE_FUSE: "1"
50
+ SHARED_MEMORY_PERCENT: 0.5
51
+ container_args:
52
+ shm_size: 256g
amlt_configs/debug-sca.yaml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: >
4
+ -m src.train
5
+ train_data='[vg-densecap-region_descriptions]' eval_data='[vg-densecap-region_descriptions]'
6
+ +model=base_sca
7
+ training.do_train=True
8
+ training.do_eval=True
9
+ training.do_inference=True
10
+ +data.streaming=False
11
+ training.max_eval_samples=800
12
+ training.max_steps=200000
13
+ training.fp16=True
14
+ model.cache_dir=/mnt/blob/weights/.model.cache/
15
+ training.save_strategy=steps
16
+ training.save_steps=5000
17
+ training.save_total_limit=3
18
+ training.optim=adamw_torch
19
+ training.evaluate_before_train=True
20
+ training.per_device_train_batch_size=1
21
+ training.evaluation_strategy=steps
22
+ training.eval_steps=5000
23
+ training.logging_steps=1000
24
+ training.logging_first_step=True
25
+ training.lr_scheduler_type=constant
26
+ training.warmup_steps=2000
27
+ training.learning_rate=1e-4
28
+ model.lm_head_model_name_or_path=gpt2-large
29
+ training.dataloader_num_workers=4
30
+ training.num_masks_per_sample=8
31
+ model.num_caption_tokens=8
32
+ training.output_dir=$AMLT_OUTPUT_DIR
33
+ training.output_log_dir=$AMLT_LOGS_DIR
34
+ wandb.group=$AMLT_EXPERIMENT_NAME-$AMLT_DESCRIPTION
35
+ wandb.name=$AMLT_JOB_NAME
36
+
37
+
38
+ environment:
39
+
40
+ image: nvidia/pytorch:23.07-py3
41
+ registry: nvcr.io
42
+
43
+ code:
44
+ local_dir: $CONFIG_DIR/../
45
+
46
+
47
+
48
+ jobs:
49
+ - name: sca-debug
50
+ sku: G$NUM_GPUS
51
+ process_count_per_node: 1 # Each node should run 1 process
52
+ preemptible: False
53
+ command:
54
+ - . amlt_configs/setup.sh
55
+ - source ~/.bashrc
56
+ - . amlt_configs/setup_accelerate_on_azure.sh
57
+ - . amlt_configs/post_process.sh
58
+ # - accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml $SHARED_CMD_ARGS || . amlt_configs/post_process.sh
59
+
60
+ submit_args:
61
+ env:
62
+ AZFUSE_USE_FUSE: "1"
63
+ SHARED_MEMORY_PERCENT: 0.5
64
+ container_args:
65
+ shm_size: 256g
amlt_configs/debug.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: '
4
+ -m src.train
5
+ train_data=[vg-densecap-region_descriptions] eval_data=[vg-densecap-region_descriptions]
6
+ +model=base_sam_captioner
7
+ training.do_train=False
8
+ training.do_eval=False
9
+ training.do_inference=True
10
+ training.num_masks_per_sample=1
11
+ +data.streaming=False
12
+ training.max_eval_samples=10
13
+ training.max_train_samples=1
14
+ training.num_train_epochs=10
15
+ training.fp16=True
16
+ training.output_dir=$AMLT_OUTPUT_DIR
17
+ training.output_log_dir=$AMLT_LOGS_DIR
18
+ model.cache_dir=/mnt/blob/weights/.model.cache/
19
+ training.dataloader_num_workers=4
20
+ '
21
+
22
+
23
+
24
+ environment:
25
+ image: nvidia/pytorch:23.07-py3
26
+ registry: nvcr.io
27
+
28
+ code:
29
+ local_dir: $CONFIG_DIR/../
30
+
31
+
32
+
33
+ jobs:
34
+ - name: sam_captioner-infer-debug
35
+ sku: G$NUM_GPUS
36
+ preemptible: False
37
+ process_count_per_node: 1 # Each node should run 1 process
38
+ command:
39
+ - . amlt_configs/setup.sh
40
+ - source ~/.bashrc
41
+ - . amlt_configs/setup_accelerate_on_azure.sh
42
+ - . amlt_configs/post_process.sh
43
+ # - accelerate launch --config_file amlt_configs/accelerate_config.yaml $SHARED_CMD_ARGS || . amlt_configs/post_process.sh
44
+
45
+
46
+ submit_args:
47
+ env:
48
+ AZFUSE_USE_FUSE: "1"
49
+ SHARED_MEMORY_PERCENT: 0.5
50
+ container_args:
51
+ shm_size: 256g
amlt_configs/infer-sam_captioner-region_chunkify-eval_suite.yaml ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: '
4
+ -m src.train
5
+ train_data=[vg-densecap-region_descriptions] eval_data=[vg-densecap-region_descriptions]
6
+ +model=base_sam_captioner
7
+ training.do_train=False
8
+ training.do_eval=False
9
+ training.do_inference=True
10
+ +data.streaming=False
11
+ training.fp16=True
12
+ training.output_dir=$AMLT_OUTPUT_DIR
13
+ training.output_log_dir=$AMLT_LOGS_DIR
14
+ model.cache_dir=/mnt/blob/weights/.model.cache/
15
+ training.dataloader_num_workers=4
16
+ '
17
+
18
+ environment:
19
+
20
+ image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
21
+ registry: nvcr.io
22
+
23
+ code:
24
+ local_dir: $CONFIG_DIR/../
25
+
26
+
27
+
28
+ # Salesforce/blip-image-captioning-large, Salesforce/blip-image-captioning-base, microsoft/git-large-coco, microsoft/git-large-textcaps, microsoft/git-large, microsoft/git-base-coco, microsoft/git-base-textcaps, microsoft/git-base
29
+ # LM_MODEL='Salesforce/blip-image-captioning-large' && amlt run config.yaml :Salesforce/blip-image-captioning-large=$LM_MODEL --extra-args "model.captioner_model_name_or_path=$LM_MODEL"
30
+ jobs:
31
+ - name: Salesforce/blip-image-captioning-large
32
+ sku: G$NUM_GPUS
33
+ preemptible: False
34
+ command:
35
+ - . amlt_configs/setup.sh
36
+ - source ~/.bashrc
37
+ - pip install pydantic==1.10.8 # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
38
+ - . amlt_configs/setup_eval_suite.sh
39
+ - . amlt_configs/setup_accelerate_on_azure.sh
40
+
41
+ # caption
42
+ - DATASET=vg-densecap-region_descriptions
43
+ - accelerate launch --num_processes $NUM_GPUS $SHARED_CMD_ARGS model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
44
+ - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-vg-densecap-region_descriptions/region_img_annot_caption/visual_genome.py-region_descriptions_v1.2.0-test.region_img.tsv
45
+
46
+ - DATASET=refcoco-google
47
+ - accelerate launch --num_processes $NUM_GPUS $SHARED_CMD_ARGS model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
48
+ - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-refcoco-google/region_img_annot_caption/refcoco.py-refcoco-google-test.region_img.tsv
49
+
50
+ # concept
51
+ - DATASET=coco-instance
52
+ - accelerate launch --num_processes $NUM_GPUS $SHARED_CMD_ARGS model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
53
+ - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
54
+
55
+ # OOM and every slow
56
+ # - DATASET=objects365-local
57
+ # - accelerate launch --num_processes $NUM_GPUS $SHARED_CMD_ARGS model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
58
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
59
+
60
+ # OOM and every slow
61
+ # - DATASET=v3det-local
62
+ # - accelerate launch --num_processes $NUM_GPUS $SHARED_CMD_ARGS model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
63
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
64
+
65
+ submit_args:
66
+ env:
67
+ SHARED_MEMORY_PERCENT: 0.5
68
+ container_args:
69
+ shm_size: 256g
amlt_configs/infer-sca-eval_suite-ckpt.yaml ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: >-
4
+ -m src.train
5
+ +model=base_sca
6
+ training.do_train=False
7
+ training.do_eval=False
8
+ training.do_inference=True
9
+ training.fp16=True
10
+ wandb.log=False
11
+ training.dataloader_num_workers=4
12
+ training.output_log_dir=$AMLT_LOGS_DIR
13
+ model.cache_dir=/mnt/blob/weights/.model.cache/
14
+
15
+
16
+
17
+
18
+
19
+ environment:
20
+
21
+ image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
22
+ registry: nvcr.io
23
+
24
+ code:
25
+ local_dir: $CONFIG_DIR/../
26
+
27
+
28
+
29
+ jobs:
30
+ - name: infer-eval_suite
31
+ sku: G$NUM_GPUS
32
+ preemptible: False
33
+ command:
34
+ - . amlt_configs/setup.sh
35
+ - source ~/.bashrc
36
+ - pip install pydantic==1.10.8 # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
37
+ - . amlt_configs/setup_eval_suite.sh
38
+ - . amlt_configs/setup_accelerate_on_azure.sh
39
+
40
+ # caption
41
+ - DATASET=vg-densecap-local
42
+ - >-
43
+ accelerate launch $SHARED_CMD_ARGS
44
+ train_data=[$$DATASET]
45
+ eval_data=[$$DATASET]
46
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
47
+ $EXTRA_ARGS
48
+ - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
49
+
50
+ - DATASET=refcocog-google
51
+ - >-
52
+ accelerate launch $SHARED_CMD_ARGS
53
+ train_data=[$$DATASET]
54
+ eval_data=[$$DATASET]
55
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
56
+ $EXTRA_ARGS
57
+ - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
58
+
59
+ - DATASET=refcoco-unc-split_testA
60
+ - >-
61
+ accelerate launch $SHARED_CMD_ARGS
62
+ train_data=[$$DATASET]
63
+ eval_data=[$$DATASET]
64
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
65
+ $EXTRA_ARGS
66
+ - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
67
+
68
+ - DATASET=refcoco-unc-split_testB
69
+ - >-
70
+ accelerate launch $SHARED_CMD_ARGS
71
+ train_data=[$$DATASET]
72
+ eval_data=[$$DATASET]
73
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
74
+ $EXTRA_ARGS
75
+ - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
76
+
77
+ - DATASET=refcoco+-unc-split_testA
78
+ - >-
79
+ accelerate launch $SHARED_CMD_ARGS
80
+ train_data=[$$DATASET]
81
+ eval_data=[$$DATASET]
82
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
83
+ $EXTRA_ARGS
84
+ - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
85
+
86
+ - DATASET=refcoco+-unc-split_testB
87
+ - >-
88
+ accelerate launch $SHARED_CMD_ARGS
89
+ train_data=[$$DATASET]
90
+ eval_data=[$$DATASET]
91
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
92
+ $EXTRA_ARGS
93
+ - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
94
+
95
+ # concept
96
+ # - DATASET=coco-instance
97
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
98
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
99
+
100
+ # OOM and very slow
101
+ # - DATASET=objects365-local
102
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
103
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
104
+
105
+ # OOM and very slow
106
+ # - DATASET=v3det-local
107
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
108
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
109
+
110
+ submit_args:
111
+ env:
112
+ SHARED_MEMORY_PERCENT: 0.5
113
+ container_args:
114
+ shm_size: 256g
115
+
116
+ # CKPT_PATHS=(
117
+ # /mnt/blob/weights/sca-weights.111823/finetune-gpt2_large-lr_1e_4-1xlr-lsj-bs_1-pretrain_1e_4_no_lsj_bs_32.111223.rr1-4x8-v100-32g-pre/checkpoint-100000
118
+ # /mnt/blob/weights/sca-weights.111823/gpt2-large-lsj-1xlr.110423.octo-4x8-v100-16g-no_pre/checkpoint-200000
119
+ # /mnt/blob/weights/sca-weights.111823/ollm3bv2-large-lsj-1xlr.110423.octo-4x8-v100-16g-no_pre/checkpoint-200000
120
+ # /mnt/blob/weights/sca-weights.111823/pretrain_1e_4_no_lsj_bs_32.110523.rr1-4x8-v100-32g-pre/checkpoint-100000
121
+ # )
122
+ # for CKPT_PATH in ${CKPT_PATHS[@]} ; do
123
+ # CKPT_NAME=$(basename $(dirname $CKPT_PATH))
124
+ # echo $CKPT_NAME
125
+ # amlt run \
126
+ # -d "" --extra-args "training.generation_num_beams=3 training.fp16_full_eval=True model.model_name_or_path=$CKPT_PATH model.lm_head_model_name_or_path=\$(python scripts/tools/get_sub_model_name_from_ckpt.py $CKPT_PATH lm) model.sam_model_name_or_path=facebook/sam-vit-huge" \
127
+ # -t msroctovc -w msroctows --no-pre \
128
+ # --sku G4-V100 \
129
+ # amlt_configs/infer-sca-eval_suite-ckpt.yaml \
130
+ # :0=$CKPT_NAME \
131
+ # `date +"%m%d%y"`.infer-ckpt-all_dataset \
132
+ # -y
133
+ # done
amlt_configs/infer-sca-eval_suite-coco_instance_task_type_caption-last_model.yaml ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: >-
4
+ -m src.train
5
+ +model=base_sca
6
+ training.do_train=False
7
+ training.do_eval=False
8
+ training.do_inference=True
9
+ training.fp16=True
10
+ training.output_log_dir=$AMLT_LOGS_DIR
11
+ model.cache_dir=/mnt/blob/weights/.model.cache/
12
+ wandb.log=False
13
+ training.dataloader_num_workers=4
14
+
15
+
16
+
17
+
18
+ environment:
19
+
20
+ image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
21
+ registry: nvcr.io
22
+
23
+ code:
24
+ local_dir: $CONFIG_DIR/../
25
+
26
+
27
+
28
+ jobs:
29
+ - name: infer-eval_suite
30
+ sku: G$NUM_GPUS
31
+ preemptible: False
32
+ command:
33
+ - . amlt_configs/setup.sh
34
+ - source ~/.bashrc
35
+ - pip install pydantic==1.10.8 # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
36
+ - . amlt_configs/setup_eval_suite.sh
37
+ - . amlt_configs/setup_accelerate_on_azure.sh
38
+
39
+ # get best (or max step) model
40
+ - BEST_CKPT_STEP=$$(python scripts/tools/get_model_name_from_trainer_state.py $$AMLT_MAP_INPUT_DIR "last")
41
+ - BEST_CKPT_PATH=$$(find $$AMLT_MAP_INPUT_DIR -name '*checkpoint*' | grep $$BEST_CKPT_STEP | tail -n1)
42
+
43
+ # caption
44
+ - DATASET=coco-instance-task_type_caption-local
45
+ - >-
46
+ accelerate launch $SHARED_CMD_ARGS
47
+ train_data=[$$DATASET]
48
+ eval_data=[$$DATASET]
49
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
50
+ model.model_name_or_path=$$BEST_CKPT_PATH
51
+ model.lm_head_model_name_or_path=$$(python scripts/tools/get_sub_model_name_from_ckpt.py $$BEST_CKPT_PATH "lm")
52
+ model.sam_model_name_or_path=facebook/sam-vit-huge
53
+ $EXTRA_ARGS
54
+ - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-coco-instance-local/region_img_annot_caption/coco_instance-local.py-2017-validation.region_img.tsv
55
+
56
+ # - DATASET=refcocog-google
57
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
58
+ # - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-refcocog-google/region_img_annot_caption/refcoco.py-refcocog-google-validation.region_img.tsv
59
+
60
+ # - DATASET=refcoco-unc-split_testA
61
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
62
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
63
+
64
+ # - DATASET=refcoco-unc-split_testB
65
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
66
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
67
+
68
+ # - DATASET=refcoco+-unc-split_testA
69
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
70
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
71
+
72
+ # - DATASET=refcoco+-unc-split_testB
73
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
74
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
75
+
76
+ # concept
77
+ # - DATASET=coco-instance
78
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
79
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
80
+
81
+ # OOM and very slow
82
+ # - DATASET=objects365-local
83
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
84
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
85
+
86
+ # OOM and very slow
87
+ # - DATASET=v3det-local
88
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
89
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
90
+
91
+ submit_args:
92
+ env:
93
+ SHARED_MEMORY_PERCENT: 0.5
94
+ container_args:
95
+ shm_size: 256g
amlt_configs/infer-sca-eval_suite-vg-best_model.yaml ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: >-
4
+ -m src.train
5
+ +model=base_sca
6
+ training.do_train=False
7
+ training.do_eval=False
8
+ training.do_inference=True
9
+ training.fp16=True
10
+ training.output_log_dir=$AMLT_LOGS_DIR
11
+ model.cache_dir=/mnt/blob/weights/.model.cache/
12
+ wandb.log=False
13
+ training.dataloader_num_workers=4
14
+
15
+
16
+
17
+
18
+ environment:
19
+
20
+ image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
21
+ registry: nvcr.io
22
+
23
+ code:
24
+ local_dir: $CONFIG_DIR/../
25
+
26
+
27
+
28
+ jobs:
29
+ - name: infer-eval_suite
30
+ sku: G$NUM_GPUS
31
+ preemptible: False
32
+ command:
33
+ - . amlt_configs/setup.sh
34
+ - source ~/.bashrc
35
+ - pip install pydantic==1.10.8 # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
36
+ - . amlt_configs/setup_eval_suite.sh
37
+ - . amlt_configs/setup_accelerate_on_azure.sh
38
+
39
+ # get best (or max step) model
40
+ - BEST_CKPT_STEP=$$(python scripts/tools/get_model_name_from_trainer_state.py $$AMLT_MAP_INPUT_DIR "best")
41
+ - BEST_CKPT_PATH=$$(find $$AMLT_MAP_INPUT_DIR -name '*checkpoint*' | grep $$BEST_CKPT_STEP | tail -n1)
42
+
43
+ # caption
44
+ - DATASET=vg-densecap-region_descriptions
45
+ - >-
46
+ accelerate launch $SHARED_CMD_ARGS
47
+ train_data=[$$DATASET]
48
+ eval_data=[$$DATASET]
49
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
50
+ model.model_name_or_path=$$BEST_CKPT_PATH
51
+ model.lm_head_model_name_or_path=$$(python scripts/tools/get_sub_model_name_from_ckpt.py $$BEST_CKPT_PATH "lm")
52
+ model.sam_model_name_or_path=facebook/sam-vit-huge
53
+ $EXTRA_ARGS
54
+
55
+ - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-vg-densecap-region_descriptions/region_img_annot_caption/visual_genome.py-region_descriptions_v1.2.0-test.region_img.tsv
56
+
57
+ # - DATASET=refcocog-google
58
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
59
+ # - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-refcocog-google/region_img_annot_caption/refcoco.py-refcocog-google-validation.region_img.tsv
60
+
61
+ # - DATASET=refcoco-unc-split_testA
62
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
63
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
64
+
65
+ # - DATASET=refcoco-unc-split_testB
66
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
67
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
68
+
69
+ # - DATASET=refcoco+-unc-split_testA
70
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
71
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
72
+
73
+ # - DATASET=refcoco+-unc-split_testB
74
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
75
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
76
+
77
+ # concept
78
+ # - DATASET=coco-instance
79
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
80
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
81
+
82
+ # OOM and very slow
83
+ # - DATASET=objects365-local
84
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
85
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
86
+
87
+ # OOM and very slow
88
+ # - DATASET=v3det-local
89
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
90
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
91
+
92
+ submit_args:
93
+ env:
94
+ SHARED_MEMORY_PERCENT: 0.5
95
+ container_args:
96
+ shm_size: 256g
amlt_configs/infer-sca-eval_suite-vg-last_model.yaml ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+ SHARED_CMD_ARGS: >-
4
+ -m src.train
5
+ +model=base_sca
6
+ training.do_train=False
7
+ training.do_eval=False
8
+ training.do_inference=True
9
+ training.fp16=True
10
+ training.output_log_dir=$AMLT_LOGS_DIR
11
+ model.cache_dir=/mnt/blob/weights/.model.cache/
12
+ wandb.log=False
13
+ training.dataloader_num_workers=4
14
+
15
+
16
+
17
+
18
+ environment:
19
+
20
+ image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
21
+ registry: nvcr.io
22
+
23
+ code:
24
+ local_dir: $CONFIG_DIR/../
25
+
26
+
27
+
28
+ jobs:
29
+ - name: infer-eval_suite
30
+ sku: G$NUM_GPUS
31
+ preemptible: False
32
+ command:
33
+ - . amlt_configs/setup.sh
34
+ - source ~/.bashrc
35
+ - pip install pydantic==1.10.8 # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
36
+ - . amlt_configs/setup_eval_suite.sh
37
+ - . amlt_configs/setup_accelerate_on_azure.sh
38
+
39
+ # get best (or max step) model
40
+ - BEST_CKPT_STEP=$$(python scripts/tools/get_model_name_from_trainer_state.py $$AMLT_MAP_INPUT_DIR "last")
41
+ - BEST_CKPT_PATH=$$(find $$AMLT_MAP_INPUT_DIR -name '*checkpoint*' | grep $$BEST_CKPT_STEP | tail -n1)
42
+
43
+ # caption
44
+ - DATASET=vg-densecap-region_descriptions
45
+ - >-
46
+ accelerate launch $SHARED_CMD_ARGS
47
+ train_data=[$$DATASET]
48
+ eval_data=[$$DATASET]
49
+ training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
50
+ model.model_name_or_path=$$BEST_CKPT_PATH
51
+ model.lm_head_model_name_or_path=$$(python scripts/tools/get_sub_model_name_from_ckpt.py $$BEST_CKPT_PATH "lm")
52
+ model.sam_model_name_or_path=facebook/sam-vit-huge
53
+ $EXTRA_ARGS
54
+
55
+ - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-vg-densecap-region_descriptions/region_img_annot_caption/visual_genome.py-region_descriptions_v1.2.0-test.region_img.tsv
56
+
57
+ # - DATASET=refcocog-google
58
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
59
+ # - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-refcocog-google/region_img_annot_caption/refcoco.py-refcocog-google-validation.region_img.tsv
60
+
61
+ # - DATASET=refcoco-unc-split_testA
62
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
63
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
64
+
65
+ # - DATASET=refcoco-unc-split_testB
66
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
67
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
68
+
69
+ # - DATASET=refcoco+-unc-split_testA
70
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
71
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
72
+
73
+ # - DATASET=refcoco+-unc-split_testB
74
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
75
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
76
+
77
+ # concept
78
+ # - DATASET=coco-instance
79
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
80
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
81
+
82
+ # OOM and very slow
83
+ # - DATASET=objects365-local
84
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
85
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
86
+
87
+ # OOM and very slow
88
+ # - DATASET=v3det-local
89
+ # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
90
+ # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
91
+
92
+ submit_args:
93
+ env:
94
+ SHARED_MEMORY_PERCENT: 0.5
95
+ container_args:
96
+ shm_size: 256g
amlt_configs/post_process.sh ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ echo "The main process failed, enter post_process.sh"
2
+ python -c "import time;time.sleep(100000)"
amlt_configs/setup.sh ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Uninstall mlflow
4
+ # pip uninstall -y mlflow-skinny
5
+ # pip uninstall -y azureml-mlflow
6
+
7
+ nvidia-smi
8
+
9
+ # Download azcopy
10
+ TMP_DIR=tmp/
11
+ AZCOPY_URL=https://aka.ms/downloadazcopy-v10-linux
12
+ AZCOPY_TAR_FILE="$TMP_DIR/azcopy-v10-linux.tar.gz"
13
+ AZCOPY_FILE="$TMP_DIR/azcopy"
14
+
15
+ "$AZCOPY_FILE" --version
16
+ has_azcopy=$?
17
+
18
+ if [[ has_azcopy -eq 0 ]]; then
19
+ echo "azcopy exists"
20
+ else
21
+ echo "azcopy does not exist"
22
+ mkdir -p $TMP_DIR
23
+ wget $AZCOPY_URL -O $AZCOPY_TAR_FILE
24
+ file_to_be_extracted="$(tar -tvf $AZCOPY_TAR_FILE | grep -E 'azcopy$' | awk '{print $6}')"
25
+ tar -zxvf $AZCOPY_TAR_FILE -C "$TMP_DIR" "$file_to_be_extracted"
26
+ mv $TMP_DIR/$file_to_be_extracted $TMP_DIR
27
+ rm $AZCOPY_TAR_FILE
28
+ rmdir "$(dirname $TMP_DIR/$file_to_be_extracted)"
29
+ chmod 777 $AZCOPY_FILE
30
+ export PATH=$PATH:$(pwd)/$TMP_DIR
31
+ echo "export PATH=\$PATH:$(pwd)/$TMP_DIR" >> ~/.bashrc
32
+ fi
33
+
34
+ # Install pip requirements
35
+ pip install -r requirements.txt
36
+ echo "export PATH=\$PATH:\$HOME/.local/bin" >> ~/.bashrc
37
+ export PATH=$PATH:$HOME/.local/bin
38
+
39
+ # Add wandb api
40
+ # ref: https://docs.wandb.ai/guides/track/environment-variables
41
+ MY_WANDB_API_KEY='YOUR_WANDB_API_KEY'
42
+ export WANDB_API_KEY=$MY_WANDB_API_KEY
43
+ echo "export WANDB_API_KEY=$MY_WANDB_API_KEY" >> ~/.bashrc
44
+
45
+ # Show full error trace from hydra
46
+ echo "export HYDRA_FULL_ERROR=1" >> ~/.bashrc
47
+
48
+ # Change dataset to hg download
49
+ TARGET_DATASETS_VER="2.13.1"
50
+ version="$(pip show datasets | grep Version | awk '{print $2}')"
51
+ if [[ $version == $TARGET_DATASETS_VER ]]; then
52
+ echo "datasets version is $TARGET_DATASETS_VER, changing it to use azcopy..."
53
+ pip_package_path="$(pip show datasets | grep Location | awk '{print $2}')"
54
+ download_file_path="$pip_package_path/datasets/utils/file_utils.py"
55
+ if [[ -f $download_file_path.bak ]]; then
56
+ cp $download_file_path.bak $download_file_path
57
+ fi
58
+ cp $download_file_path $download_file_path.bak
59
+ sed -i '609 i\
60
+ # NOTE(xiaoke): An intrusion to use azcopy to download from Azure blob storage\
61
+ elif "blob.core.windows.net" in url:\
62
+ process_id = -1\
63
+ try:\
64
+ import torch\
65
+ if torch.distributed.is_initialized():\
66
+ process_id = torch.distributed.get_rank()\
67
+ except ImportError:\
68
+ logger.warning("no torch found, cannot determine whether is in ddp mode")\
69
+ except RuntimeError:\
70
+ logger.warning("torch.distributed is not initialized, cannot determine whether is in ddp mode")\
71
+ \
72
+ logger.warning(f"[process {process_id}] Try to use azcopy to download from Azure blob storage")\
73
+ import subprocess\
74
+ \
75
+ has_azcopy = subprocess.run(["azcopy"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).returncode\
76
+ if has_azcopy != 0:\
77
+ logger.warning(f"[process {process_id}] azcopy not found, using http_get, which is slow")\
78
+ http_get(\
79
+ url,\
80
+ temp_file,\
81
+ proxies=proxies,\
82
+ resume_size=resume_size,\
83
+ headers=headers,\
84
+ cookies=cookies,\
85
+ max_retries=max_retries,\
86
+ desc=download_desc,\
87
+ )\
88
+ else:\
89
+ logger.warning(f"[process {process_id}] azcopy found, using azcopy")\
90
+ result = subprocess.run(\
91
+ ["azcopy", "cp", url, temp_file.name],\
92
+ )\
93
+ if result.returncode != 0:\
94
+ raise ConnectionError(\
95
+ f"azcopy failed with return code {result.returncode}"\
96
+ )\
97
+ ' $download_file_path
98
+ else
99
+ echo "datasets version is NOT $TARGET_DATASETS_VER, not changed"
100
+ fi
101
+
102
+ # For debug
103
+ sudo apt-get update
104
+ if [[ $? -ne 0 ]]; then
105
+ apt-get update
106
+ fi
107
+ sudo apt-get install -y tmux htop vim lsof
108
+ if [[ $? -ne 0 ]]; then
109
+ apt-get install -y tmux htop vim lsof
110
+ fi
111
+
112
+ # Tmux config
113
+ curl -L https://raw.githubusercontent.com/hamvocke/dotfiles/master/tmux/.tmux.conf -o - >> ~/.tmux.conf
114
+
115
+ # Vim config
116
+ # Install vim-plug
117
+ curl -fLo ~/.vim/autoload/plug.vim --create-dirs \
118
+ https://raw.githubusercontent.com/junegunn/vim-plug/master/plug.vim
119
+
120
+ cat << EOF > ~/.vimrc
121
+ set tabstop=4
122
+ set shiftwidth=4
123
+ set expandtab
124
+ set smartindent
125
+ set nu
126
+ set hlsearch
127
+ set ignorecase
128
+ set mouse=a
129
+
130
+ call plug#begin()
131
+ Plug 'tpope/vim-surround'
132
+ Plug 'tpope/vim-commentary'
133
+ Plug 'davidhalter/jedi-vim'
134
+ call plug#end()
135
+
136
+ let g:jedi#force_py_version = 3 " Force using Python 3
137
+ EOF
138
+ vim +'PlugInstall --sync' +qa
139
+
140
+ # Install gpustat
141
+ pip install gpustat
142
+
143
+ # echo pwd
144
+ echo "pwd: $(pwd)"
amlt_configs/setup_accelerate_on_azure.sh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ source ~/.bashrc
3
+
4
+ ACCELERATE_CONFIG_PATHS=(amlt_configs/accelerate_config.yaml amlt_configs/accelerate_deepspeed_config.yaml)
5
+ if [[ -z "$WORLD_SIZE" ]]; then
6
+ echo "WORLD_SIZE is not set, using 1"
7
+ WORLD_SIZE=1
8
+ fi
9
+ if [[ -z "$NODE_RANK" ]]; then
10
+ echo "NODE_RANK is not set, using 0"
11
+ NODE_RANK=0
12
+ fi
13
+ NUM_GPUS_PER_NODE=$(nvidia-smi -L | wc -l)
14
+ ((NUM_TOTAL_GPUS = WORLD_SIZE * NUM_GPUS_PER_NODE))
15
+
16
+ echo "Setting up accelerate config:"
17
+ echo "ACCELERATE_CONFIG_PATHS: ${ACCELERATE_CONFIG_PATHS[@]}"
18
+ echo "NUM_TOTAL_GPUS: $NUM_TOTAL_GPUS"
19
+ echo "NUM_GPUS_PER_NODE: $NUM_GPUS_PER_NODE"
20
+ echo "WORLD_SIZE: $WORLD_SIZE"
21
+ echo "NODE_RANK: $NODE_RANK"
22
+ echo "MASTER_ADDR: $MASTER_ADDR"
23
+ echo "MASTER_PORT: $MASTER_PORT"
24
+
25
+ function modify_accelerate_config()
26
+ {
27
+ local ACCELERATE_CONFIG_PATH=$1
28
+ if [[ -z "$MASTER_ADDR" ]]; then
29
+ echo "MASTER_ADDR is not set, using localhost"
30
+ sed -i 's/main_process_ip.*//g' $ACCELERATE_CONFIG_PATH
31
+ sed -i 's/main_process_port.*//g' $ACCELERATE_CONFIG_PATH
32
+ else
33
+ sed -i 's/main_process_ip.*/main_process_ip: '"$MASTER_ADDR"'/g' $ACCELERATE_CONFIG_PATH
34
+ sed -i 's/main_process_port.*/main_process_port: '"$MASTER_PORT"'/g' $ACCELERATE_CONFIG_PATH
35
+ fi
36
+
37
+ sed -i 's/num_machines.*/num_machines: '"$WORLD_SIZE"'/g' $ACCELERATE_CONFIG_PATH
38
+ sed -i 's/machine_rank.*/machine_rank: '"$NODE_RANK"'/g' $ACCELERATE_CONFIG_PATH
39
+
40
+ sed -i 's/num_processes.*/num_processes: '"$NUM_TOTAL_GPUS"'/g' $ACCELERATE_CONFIG_PATH
41
+
42
+ accelerate env --config_file $ACCELERATE_CONFIG_PATH
43
+ # accelerate test --config_file $ACCELERATE_CONFIG_PATH # It may cause bug..ValueError: To use a `DataLoader` in `split_batches` mode, the batch size (8) needs to be a round multiple of the number of processes (16).
44
+ }
45
+
46
+ for ACCELERATE_CONFIG_PATH in "${ACCELERATE_CONFIG_PATHS[@]}"; do
47
+ if [[ -f "$ACCELERATE_CONFIG_PATH" ]]; then
48
+ echo "ACCELERATE_CONFIG_PATH: $ACCELERATE_CONFIG_PATH exists, modifying it with env variables."
49
+ modify_accelerate_config $ACCELERATE_CONFIG_PATH
50
+ else
51
+ echo "ACCELERATE_CONFIG_PATH: $ACCELERATE_CONFIG_PATH does not exist"
52
+ fi
53
+ done
amlt_configs/setup_eval_suite.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ source ~/.bashrc
3
+
4
+ ORIGINAL_DIR="$(pwd)"
5
+ git clone --recursive https://github.com/xk-huang/vdtk.git /tmp/vdtk -b dev
6
+ cd /tmp/vdtk
7
+ git submodule update --init --recursive
8
+
9
+ apt-get update
10
+ sudo apt-get update
11
+ apt-get install -y git-lfs gawk
12
+ sudo apt-get install -y git-lfs gawk
13
+
14
+ git lfs install
15
+ git clone https://huggingface.co/xk-huang/vdtk-data
16
+ # git submodule init && git submodule update
17
+
18
+ rsync -avP ./vdtk-data/vdtk .
19
+ rm -rf vdtk-data
20
+
21
+ pip install --upgrade pip
22
+ pip install -e . POT==0.9.0 # POT=0.9.1 will take up all the memory with tf backend
23
+ pip install tensorflow==2.12.1 # Just fix one version of tf
24
+ pip install levenshtein==0.21.1
25
+ pip install openpyxl==3.1.2
26
+
27
+ python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"
28
+ cd "$ORIGINAL_DIR"
amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: this kind of string leaded by > will append a new line to the end of the string
5
+ SHARED_CMD_ARGS: >-
6
+ -m src.train
7
+ +model=base_sca_multitask_v2
8
+ training.do_train=True
9
+ training.do_eval=True
10
+ training.do_inference=True
11
+ +data.streaming=False
12
+ training.max_eval_samples=800
13
+ training.max_steps=100000
14
+ training.fp16=True
15
+ training.output_dir=$AMLT_OUTPUT_DIR
16
+ training.output_log_dir=$AMLT_LOGS_DIR
17
+ model.cache_dir=/mnt/blob/weights/.model.cache/
18
+ training.save_strategy=steps
19
+ training.save_steps=5000
20
+ training.save_total_limit=3
21
+ training.optim=adamw_torch
22
+ training.evaluate_before_train=True
23
+ training.per_device_train_batch_size=1
24
+ training.evaluation_strategy=steps
25
+ training.eval_steps=5000
26
+ training.logging_steps=1000
27
+ training.logging_first_step=True
28
+ training.dataloader_num_workers=4
29
+ training.num_masks_per_sample=16
30
+ wandb.project=$AMLT_EXPERIMENT_NAME
31
+ wandb.name=$AMLT_JOB_NAME
32
+ model.num_caption_tokens=8
33
+ model.additional_num_hidden_layers=12
34
+ model.num_task_tokens=6
35
+ training.lr_scheduler_type=cosine
36
+ model.lm_head_model_name_or_path=gpt2-large
37
+ training.learning_rate=1e-5
38
+ training.weight_decay=1e-4
39
+ training.warmup_steps=200
40
+ training.warmup_ratio=0.33333333
41
+ training.compute_metrics=True
42
+
43
+
44
+
45
+ environment:
46
+
47
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot initialized successfully
48
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot initialized successfully
49
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
50
+ registry: nvcr.io
51
+
52
+ code:
53
+ local_dir: $CONFIG_DIR/../
54
+
55
+
56
+
57
+ jobs:
58
+ - name: gpt2-large
59
+ preemptible: True
60
+ sku: ${NUM_NODES}xG${NUM_GPUS}-V100-IB
61
+ process_count_per_node: 1 # Each node should run 1 process
62
+ command:
63
+ - . amlt_configs/setup.sh
64
+ - source ~/.bashrc
65
+ - . amlt_configs/setup_accelerate_on_azure.sh
66
+ - >-
67
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
68
+ $SHARED_CMD_ARGS
69
+ train_data='[vg-densecap-region_descriptions]'
70
+ eval_data='[vg-densecap-region_descriptions]'
71
+ model.lm_head_model_name_or_path=gpt2-large
72
+ $EXTRA_ARGS
73
+
74
+
75
+ submit_args:
76
+ env:
77
+ SHARED_MEMORY_PERCENT: 0.5
78
+ HYDRA_FULL_ERROR: 1
79
+ # NCCL_IB_DISABLE: 1
80
+ # NCCL_IBEXT_DISABLE: 1
81
+ container_args:
82
+ shm_size: 256g
83
+
84
+ - name: open_llama_3b_v2
85
+ preemptible: True
86
+ sku: ${NUM_NODES}xG${NUM_GPUS}-V100-IB
87
+ process_count_per_node: 1 # Each node should run 1 process
88
+ command:
89
+ - . amlt_configs/setup.sh
90
+ - source ~/.bashrc
91
+ - . amlt_configs/setup_accelerate_on_azure.sh
92
+ - >-
93
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
94
+ $SHARED_CMD_ARGS
95
+ train_data='[vg-densecap-region_descriptions]'
96
+ eval_data='[vg-densecap-region_descriptions]'
97
+ model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
98
+ training.gradient_checkpointing=true
99
+ $EXTRA_ARGS
100
+
101
+
102
+ submit_args:
103
+ env:
104
+ SHARED_MEMORY_PERCENT: 0.5
105
+ HYDRA_FULL_ERROR: 1
106
+ # NCCL_IB_DISABLE: 1
107
+ # NCCL_IBEXT_DISABLE: 1
108
+ container_args:
109
+ shm_size: 256g
110
+
111
+
112
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=1e-4" \
113
+ # -t msroctovc -w msroctows --sku=G8-V100 --no-pre \
114
+ # amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_4 train-sca-ablat-finetune-scale_lr-110423
115
+
116
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=1e-5" \
117
+ # -t msroctovc -w msroctows --sku=G8-V100 --no-pre \
118
+ # amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_5 train-sca-ablat-finetune-scale_lr-110423
119
+
120
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-5" \
121
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
122
+ # amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_5-1xlr train-sca-ablat-finetune-scale_lr-110423
123
+
124
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4" \
125
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
126
+ # amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_4-1xlr train-sca-ablat-finetune-scale_lr-110423
amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: a folded string led by > will append a newline to the end of the string (hence >- is used here)
5
+ SHARED_CMD_ARGS: >-
6
+ -m src.train
7
+ +model=base_sca_multitask_v2
8
+ training.do_train=True
9
+ training.do_eval=True
10
+ training.do_inference=True
11
+ +data.streaming=False
12
+ training.max_eval_samples=800
13
+ training.max_steps=200000
14
+ training.fp16=True
15
+ training.output_dir=$AMLT_OUTPUT_DIR
16
+ training.output_log_dir=$AMLT_LOGS_DIR
17
+ model.cache_dir=/mnt/blob/weights/.model.cache/
18
+ training.save_strategy=steps
19
+ training.save_steps=5000
20
+ training.save_total_limit=3
21
+ training.optim=adamw_torch
22
+ training.evaluate_before_train=True
23
+ training.per_device_train_batch_size=1
24
+ training.evaluation_strategy=steps
25
+ training.eval_steps=5000
26
+ training.logging_steps=1000
27
+ training.logging_first_step=True
28
+ training.dataloader_num_workers=4
29
+ training.num_masks_per_sample=16
30
+ wandb.project=$AMLT_EXPERIMENT_NAME
31
+ wandb.name=$AMLT_JOB_NAME
32
+ model.num_caption_tokens=8
33
+ model.additional_num_hidden_layers=12
34
+ model.num_task_tokens=6
35
+ training.lr_scheduler_type=cosine
36
+ model.lm_head_model_name_or_path=gpt2-large
37
+ training.learning_rate=1e-4
38
+ training.weight_decay=1e-4
39
+ training.warmup_steps=200
40
+ training.warmup_ratio=0.33333333
41
+ training.compute_metrics=True
42
+
43
+
44
+
45
+ environment:
46
+
47
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
48
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
49
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
50
+ registry: nvcr.io
51
+
52
+ code:
53
+ local_dir: $CONFIG_DIR/../
54
+
55
+
56
+
57
+ jobs:
58
+ - name: gpt2-large
59
+ preemptible: True
60
+ sku: ${NUM_NODES}xG${NUM_GPUS}
61
+ process_count_per_node: 1 # Each node should run 1 process
62
+ command:
63
+ - . amlt_configs/setup.sh
64
+ - source ~/.bashrc
65
+ - . amlt_configs/setup_accelerate_on_azure.sh
66
+ - >-
67
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
68
+ $SHARED_CMD_ARGS
69
+ train_data='[vg-densecap-local]'
70
+ eval_data='[vg-densecap-local]'
71
+ model.lm_head_model_name_or_path=gpt2-large
72
+ $EXTRA_ARGS
73
+
74
+ submit_args:
75
+ env:
76
+ SHARED_MEMORY_PERCENT: 0.5
77
+ HYDRA_FULL_ERROR: 1
78
+ container_args:
79
+ shm_size: 256g
80
+
81
+ - name: open_llama_3b_v2
82
+ preemptible: True
83
+ sku: ${NUM_NODES}xG${NUM_GPUS}
84
+ process_count_per_node: 1 # Each node should run 1 process
85
+ command:
86
+ - . amlt_configs/setup.sh
87
+ - source ~/.bashrc
88
+ - . amlt_configs/setup_accelerate_on_azure.sh
89
+ - >-
90
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
91
+ $SHARED_CMD_ARGS
92
+ train_data='[vg-densecap-local]'
93
+ eval_data='[vg-densecap-local]'
94
+ model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
95
+ training.gradient_checkpointing=true
96
+ $EXTRA_ARGS
97
+
98
+ submit_args:
99
+ env:
100
+ SHARED_MEMORY_PERCENT: 0.5
101
+ HYDRA_FULL_ERROR: 1
102
+ container_args:
103
+ shm_size: 256g
104
+
105
+
106
+ # sing resrch 1x8 no-pre lsj
107
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0" -t msrresrchvc -w msrresrchws --sku=G8-V100 --no-pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.resrch-1x8-v100-16g-no_pre.ollm3bv2-large-lsj train-sca-ablat-lsj-scale_lr-110423
108
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0" -t msrresrchvc -w msrresrchws --sku=G8-V100 --no-pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-1x8-v100-16g-no_pre.gpt2-large-lsj train-sca-ablat-lsj-scale_lr-110423
109
+
110
+ # sing octo 4x8 no-pre lsj
111
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr train-sca-ablat-lsj-scale_lr-110423
112
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr train-sca-ablat-lsj-scale_lr-110423
113
+
114
+ # The maximum scale lr with BS 64: 8e-4 (too big to achieve better)
115
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=8e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.resrch-16x4-v100-16g-pre.ollm3bv2-large-lsj-1xlr train-sca-ablat-lsj-scale_lr-110423
116
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=8e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-16x4-v100-16g-no_pre.gpt2-large-lsj-1xlr train-sca-ablat-lsj-scale_lr-110423
117
+
118
+ # The maximum scale lr with BS 64: 4e-4 (try to achieve better with that from BS 32)
119
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.resrch-16x4-v100-16g-pre.ollm3bv2-large-lsj-1xlr-4e_4 train-sca-ablat-lsj-scale_lr-110423
120
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-16x4-v100-16g-no_pre.gpt2-large-lsj-1xlr-4e_4 train-sca-ablat-lsj-scale_lr-110423
121
+
122
+ # 1x8, 4e-4
123
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t itplabrr1cl1 -w resrchvc --sku=G8-V100 --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.rr1-1x8-v100-16g-pre.ollm3bv2-large-lsj-4e_4 train-sca-ablat-lsj-scale_lr-110423
124
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t itplabrr1cl1 -w resrchvc --sku=G8-V100 --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.rr1-1x8-v100-16g-pre.gpt2-large-lsj-4e_4 train-sca-ablat-lsj-scale_lr-110423
125
+
126
+ # The maximum scale lr with BS 64: 4e-4 (try to achieve better with that from BS 32)
127
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.resrch-16x4-v100-16g-pre.ollm3bv2-large-lsj-1xlr-4e_4 train-sca-ablat-lsj-scale_lr-110423
128
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-16x4-v100-16g-no_pre.gpt2-large-lsj-1xlr-4e_4 train-sca-ablat-lsj-scale_lr-110423
amlt_configs/train-sca-ablat-model_arch-103123.yaml ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: a folded string led by > will append a newline to the end of the string (hence >- is used here)
5
+ SHARED_CMD_ARGS: >-
6
+ -m src.train
7
+ training.do_train=True
8
+ training.do_eval=True
9
+ training.do_inference=True
10
+ training.max_eval_samples=800
11
+ training.max_steps=200000
12
+ training.fp16=True
13
+ training.output_dir=$AMLT_OUTPUT_DIR
14
+ training.output_log_dir=$AMLT_LOGS_DIR
15
+ training.save_strategy=steps
16
+ training.save_steps=5000
17
+ training.save_total_limit=3
18
+ training.optim=adamw_torch
19
+ training.evaluate_before_train=True
20
+ training.per_device_train_batch_size=1
21
+ training.evaluation_strategy=steps
22
+ training.eval_steps=5000
23
+ training.logging_steps=1000
24
+ training.logging_first_step=True
25
+ training.dataloader_num_workers=4
26
+ training.num_masks_per_sample=16
27
+ training.lr_scheduler_type=cosine
28
+ training.learning_rate=1e-4
29
+ training.weight_decay=1e-4
30
+ training.warmup_steps=200
31
+ training.warmup_ratio=0.33333333
32
+ training.compute_metrics=True
33
+ wandb.project=$AMLT_EXPERIMENT_NAME
34
+ wandb.name=$AMLT_JOB_NAME
35
+ model.cache_dir=/mnt/blob/weights/.model.cache/
36
+ model.num_task_tokens=6
37
+ model.lm_head_model_name_or_path=gpt2-large
38
+
39
+
40
+
41
+ environment:
42
+
43
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
44
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
45
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
46
+ registry: nvcr.io
47
+
48
+ code:
49
+ local_dir: $CONFIG_DIR/../
50
+
51
+
52
+
53
+ jobs:
54
+ - name: gpt2-large
55
+ preemptible: True
56
+ sku: ${NUM_NODES}xG${NUM_GPUS}
57
+ process_count_per_node: 1 # Each node should run 1 process
58
+ command:
59
+ - . amlt_configs/setup.sh
60
+ - source ~/.bashrc
61
+ - . amlt_configs/setup_accelerate_on_azure.sh
62
+ - >-
63
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
64
+ $SHARED_CMD_ARGS
65
+ train_data='[vg-densecap-local]'
66
+ eval_data='[vg-densecap-local]'
67
+ model.lm_head_model_name_or_path=gpt2-large
68
+ $EXTRA_ARGS
69
+
70
+ submit_args:
71
+ env:
72
+ SHARED_MEMORY_PERCENT: 0.5
73
+ HYDRA_FULL_ERROR: 1
74
+ container_args:
75
+ shm_size: 256g
76
+
77
+ - name: open_llama_3b_v2
78
+ preemptible: True
79
+ sku: ${NUM_NODES}xG${NUM_GPUS}
80
+ process_count_per_node: 1 # Each node should run 1 process
81
+ command:
82
+ - . amlt_configs/setup.sh
83
+ - source ~/.bashrc
84
+ - . amlt_configs/setup_accelerate_on_azure.sh
85
+ - >-
86
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
87
+ $SHARED_CMD_ARGS
88
+ train_data='[vg-densecap-local]'
89
+ eval_data='[vg-densecap-local]'
90
+ model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
91
+ training.gradient_checkpointing=true
92
+ $EXTRA_ARGS
93
+
94
+ submit_args:
95
+ env:
96
+ SHARED_MEMORY_PERCENT: 0.5
97
+ HYDRA_FULL_ERROR: 1
98
+ container_args:
99
+ shm_size: 256g
100
+
101
+
102
+ # sing resrch 1x8
103
+ # amlt run amlt_configs/train-sca-ablat-model_arch-103123.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.gpt2-large-mtv2 train-sca-ablat-model_arch-103123 -d "" --extra-args "+model=base_sca_multitask_v2 model.num_caption_tokens=8 model.additional_num_hidden_layers=12" -t msroctovc -w msroctows --sku=G8-V100 --no-pre
104
+ # amlt run amlt_configs/train-sca-ablat-model_arch-103123.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.gpt2-large-sm train-sca-ablat-model_arch-103123 -d "" --extra-args "+model=base_sca_multitask_split_mixer model.num_caption_tokens=8 model.additional_num_hidden_layers=12" -t msroctovc -w msroctows --sku=G8-V100 --no-pre
105
+ # sing octo 1x8 v100 16g
106
+ # amlt run amlt_configs/train-sca-ablat-model_arch-103123.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.gpt2-large-ddv2 train-sca-ablat-model_arch-103123 -d "" --extra-args "+model=base_sca_direct_decoding_v2 model.additional_num_hidden_layers=12" -t msroctovc -w msroctows --sku=G8-V100 --no-pre
107
+ # amlt run -d "" --extra-args "+model=base_sca_multitask_roi_pool" -t msrresrchvc -w msrresrchws --sku=G8-V100 --no-pre amlt_configs/train-sca-ablat-model_arch-103123.yaml :0=`date +"%m%d%y"`.resrch-1x8-v100-16g-no_pre.gpt2-large-roi_pool train-sca-ablat-model_arch-103123
108
+ # amlt run -d "" --extra-args "+model=base_sca_multitask_roi_pool model.vl_projector_type=mlp" -t msrresrchvc -w msrresrchws --sku=G8-V100 --no-pre amlt_configs/train-sca-ablat-model_arch-103123.yaml :0=`date +"%m%d%y"`.resrch-1x8-v100-16g-no_pre.gpt2-large-roi_pool-mlp train-sca-ablat-model_arch-103123
109
+
110
+ # Caveat:
111
+ # 1. cannot add two "+model"
112
+ # 2. base_sca_direct_decoding_v2 cannot add `num_caption_tokens`
amlt_configs/train-sca-ablat-sam_size-110423.yaml ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: this kind of string leaded by > will append a new line to the end of the string
5
+ SHARED_CMD_ARGS: >-
6
+ -m src.train
7
+ +model=base_sca_multitask_v2
8
+ training.do_train=True
9
+ training.do_eval=True
10
+ training.do_inference=True
11
+ training.max_eval_samples=800
12
+ training.max_steps=200000
13
+ training.fp16=True
14
+ training.output_dir=$AMLT_OUTPUT_DIR
15
+ training.output_log_dir=$AMLT_LOGS_DIR
16
+ training.save_strategy=steps
17
+ training.save_steps=5000
18
+ training.save_total_limit=3
19
+ training.optim=adamw_torch
20
+ training.evaluate_before_train=True
21
+ training.per_device_train_batch_size=1
22
+ training.evaluation_strategy=steps
23
+ training.eval_steps=5000
24
+ training.logging_steps=1000
25
+ training.logging_first_step=True
26
+ training.dataloader_num_workers=4
27
+ training.num_masks_per_sample=16
28
+ training.lr_scheduler_type=cosine
29
+ training.learning_rate=1e-4
30
+ training.weight_decay=1e-4
31
+ training.warmup_steps=200
32
+ training.warmup_ratio=0.33333333
33
+ training.compute_metrics=True
34
+ wandb.project=$AMLT_EXPERIMENT_NAME
35
+ wandb.name=$AMLT_JOB_NAME
36
+ model.cache_dir=/mnt/blob/weights/.model.cache/
37
+ model.additional_num_hidden_layers=12
38
+ model.num_task_tokens=6
39
+ model.lm_head_model_name_or_path=gpt2-large
40
+ model.num_caption_tokens=8
41
+
42
+
43
+
44
+ environment:
45
+
46
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
47
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
48
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
49
+ registry: nvcr.io
50
+
51
+ code:
52
+ local_dir: $CONFIG_DIR/../
53
+
54
+
55
+
56
+ jobs:
57
+ - name: gpt2-large
58
+ preemptible: True
59
+ sku: ${NUM_NODES}xG${NUM_GPUS}
60
+ process_count_per_node: 1 # Each node should run 1 process
61
+ command:
62
+ - . amlt_configs/setup.sh
63
+ - source ~/.bashrc
64
+ - . amlt_configs/setup_accelerate_on_azure.sh
65
+ - >-
66
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
67
+ $SHARED_CMD_ARGS
68
+ train_data='[vg-densecap-local]'
69
+ eval_data='[vg-densecap-local]'
70
+ model.lm_head_model_name_or_path=gpt2-large
71
+ $EXTRA_ARGS
72
+
73
+ submit_args:
74
+ env:
75
+ SHARED_MEMORY_PERCENT: 0.5
76
+ HYDRA_FULL_ERROR: 1
77
+ container_args:
78
+ shm_size: 256g
79
+
80
+ - name: open_llama_3b_v2
81
+ preemptible: True
82
+ sku: ${NUM_NODES}xG${NUM_GPUS}
83
+ process_count_per_node: 1 # Each node should run 1 process
84
+ command:
85
+ - . amlt_configs/setup.sh
86
+ - source ~/.bashrc
87
+ - . amlt_configs/setup_accelerate_on_azure.sh
88
+ - >-
89
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
90
+ $SHARED_CMD_ARGS
91
+ train_data='[vg-densecap-local]'
92
+ eval_data='[vg-densecap-local]'
93
+ model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
94
+ training.gradient_checkpointing=true
95
+ $EXTRA_ARGS
96
+
97
+ submit_args:
98
+ env:
99
+ SHARED_MEMORY_PERCENT: 0.5
100
+ HYDRA_FULL_ERROR: 1
101
+ container_args:
102
+ shm_size: 256g
103
+
104
+
105
+ # sing resrch 1x8
106
+ # amlt run amlt_configs/train-sca-ablat-sam_size-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.sam-vit-huge train-sca-ablat-sam_size-110423 -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-huge" -t msroctovc -w msroctows --sku=G8-V100 --no-pre -y
107
+ # amlt run amlt_configs/train-sca-ablat-sam_size-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.sam-vit-large train-sca-ablat-sam_size-110423 -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-large" -t msroctovc -w msroctows --sku=G8-V100 --no-pre -y
108
+ # amlt run amlt_configs/train-sca-ablat-sam_size-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.sam-vit-base train-sca-ablat-sam_size-110423 -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-base" -t msroctovc -w msroctows --sku=G8-V100 --no-pre -y
amlt_configs/train-sca-ablat-timm.yaml ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: a folded string led by > will append a newline to the end of the string (hence >- is used here)
5
+ # Use base_sca_multitask_v2
6
+ # training.lr_scheduler_type=constant
7
+ SHARED_CMD_ARGS: >-
8
+ -m src.train
9
+ +model=base_sca_timm_multitask_v2
10
+ training.do_train=True
11
+ training.do_eval=True
12
+ training.do_inference=True
13
+ +data.streaming=False
14
+ training.max_eval_samples=800
15
+ training.max_steps=200000
16
+ training.fp16=True
17
+ training.output_dir=$AMLT_OUTPUT_DIR
18
+ training.output_log_dir=$AMLT_LOGS_DIR
19
+ model.cache_dir=/mnt/blob/weights/.model.cache/
20
+ training.save_strategy=steps
21
+ training.save_steps=5000
22
+ training.save_total_limit=3
23
+ training.optim=adamw_torch
24
+ training.evaluate_before_train=True
25
+ training.per_device_train_batch_size=1
26
+ training.evaluation_strategy=steps
27
+ training.eval_steps=5000
28
+ training.logging_steps=1000
29
+ training.logging_first_step=True
30
+ training.dataloader_num_workers=4
31
+ training.num_masks_per_sample=16
32
+ wandb.project=$AMLT_EXPERIMENT_NAME
33
+ wandb.name=$AMLT_JOB_NAME
34
+ model.num_caption_tokens=8
35
+ model.additional_num_hidden_layers=12
36
+ model.num_task_tokens=6
37
+ training.lr_scheduler_type=cosine
38
+ model.lm_head_model_name_or_path=gpt2-large
39
+ training.learning_rate=1e-4
40
+ training.weight_decay=1e-4
41
+ training.warmup_steps=200
42
+ training.warmup_ratio=0.33333333
43
+ training.compute_metrics=True
44
+
45
+
46
+
47
+ environment:
48
+
49
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
50
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
51
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
52
+ registry: nvcr.io
53
+
54
+ code:
55
+ local_dir: $CONFIG_DIR/../
56
+
57
+
58
+
59
+ jobs:
60
+ - name: vit_base_patch32_clip_224.openai
61
+ preemptible: True
62
+ sku: ${NUM_NODES}xG${NUM_GPUS}
63
+ process_count_per_node: 1 # Each node should run 1 process
64
+ command:
65
+ - . amlt_configs/setup.sh
66
+ - source ~/.bashrc
67
+ - . amlt_configs/setup_accelerate_on_azure.sh
68
+ - >-
69
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
70
+ $SHARED_CMD_ARGS
71
+ train_data='[vg-densecap-local]'
72
+ eval_data='[vg-densecap-local]'
73
+ model.sam_model_name_or_path=facebook/sam-vit-base
74
+ model.timm_vision_name=vit_base_patch32_clip_224.openai
75
+ $EXTRA_ARGS
76
+ submit_args:
77
+ env:
78
+ SHARED_MEMORY_PERCENT: 0.5
79
+ HYDRA_FULL_ERROR: 1
80
+ # NCCL_IB_DISABLE: 1
81
+ # NCCL_IBEXT_DISABLE: 1
82
+ container_args:
83
+ shm_size: 256g
84
+
85
+ # TIMM_NAME_LS=(
86
+ # vit_large_patch14_clip_336.openai
87
+ # vit_large_patch14_clip_224.datacompxl
88
+ # eva02_large_patch14_clip_336.merged2b
89
+ # )
90
+ # for TIMM_NAME in ${TIMM_NAME_LS[@]}; do
91
+ # amlt run -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2) model.timm_vision_name=$TIMM_NAME" \
92
+ # -t itplabrr1cl1 -w resrchvc --no-pre \
93
+ # amlt_configs/train-sca-ablat-timm.yaml \
94
+ # :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.$TIMM_NAME \
95
+ # 112323.train-sca-ablat-timm
96
+ # done
97
+
98
+ # TIMM_NAME_LS=(
99
+ # vit_large_patch14_clip_336.openai
100
+ # vit_large_patch14_clip_224.datacompxl
101
+ # eva02_large_patch14_clip_336.merged2b
102
+ # )
103
+ # for TIMM_NAME in ${TIMM_NAME_LS[@]}; do
104
+ # amlt run -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2) model.timm_vision_name=$TIMM_NAME training.trainable_params='[mask_decoder.additional_transformer,mask_decoder.caption_tokens,task_tokens,language_project,neck,mask_decoder.transformer]'" \
105
+ # -t itplabrr1cl1 -w resrchvc --no-pre \
106
+ # amlt_configs/train-sca-ablat-timm.yaml \
107
+ # :0=`date +"%m%d%y"`.rr1-1x8-v100-16g-no_pre.$TIMM_NAME-tune_sam_xformer \
108
+ # 112323.train-sca-ablat-timm
109
+ # done
110
+
111
+ # TIMM_NAME_LS=(
112
+ # vit_large_patch16_224.mae
113
+ # vit_large_patch14_reg4_dinov2.lvd142m
114
+ # )
115
+ # for TIMM_NAME in ${TIMM_NAME_LS[@]}; do
116
+ # amlt run -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2) model.timm_vision_name=$TIMM_NAME training.trainable_params='[mask_decoder.additional_transformer,mask_decoder.caption_tokens,task_tokens,language_project,neck,mask_decoder.transformer]'" \
117
+ # -t itplabrr1cl1 -w resrchvc --no-pre \
118
+ # amlt_configs/train-sca-ablat-timm.yaml \
119
+ # :0=`date +"%m%d%y"`.rr1-1x8-v100-16g-no_pre.$TIMM_NAME-tune_sam_xformer \
120
+ # 112323.train-sca-ablat-timm
121
+ # done
122
+
123
+ # TIMM_NAME_LS=(
124
+ # vit_large_patch14_reg4_dinov2.lvd142m
125
+ # )
126
+ # for TIMM_NAME in ${TIMM_NAME_LS[@]}; do
127
+ # amlt run -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2) model.timm_vision_name=$TIMM_NAME training.trainable_params='[mask_decoder.additional_transformer,mask_decoder.caption_tokens,task_tokens,language_project,neck,mask_decoder.transformer]'" \
128
+ # -t msroctovc -w msroctows --no-pre \
129
+ # amlt_configs/train-sca-ablat-timm.yaml \
130
+ # :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.$TIMM_NAME-tune_sam_xformer \
131
+ # 112323.train-sca-ablat-timm
132
+ # done
133
+
134
+ # TIMM_NAME_LS=(
135
+ # eva02_large_patch14_clip_336.merged2b
136
+ # )
137
+ # for TIMM_NAME in ${TIMM_NAME_LS[@]}; do
138
+ # amlt run -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2) model.timm_vision_name=$TIMM_NAME" \
139
+ # -t msroctovc -w msroctows --no-pre \
140
+ # amlt_configs/train-sca-ablat-timm.yaml \
141
+ # :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.$TIMM_NAME \
142
+ # 112323.train-sca-ablat-timm
143
+ # done
amlt_configs/train-sca-ablat-weak_sup_data.yaml ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: a folded string led by > will append a newline to the end of the string (hence >- is used here)
5
+ # Use base_sca_multitask_v2
6
+ # training.lr_scheduler_type=constant
7
+ SHARED_CMD_ARGS: >-
8
+ -m src.train
9
+ +model=base_sca_multitask_v2
10
+ training.do_train=True
11
+ training.do_eval=True
12
+ training.do_inference=True
13
+ +data.streaming=False
14
+ training.max_eval_samples=800
15
+ training.max_steps=200000
16
+ training.fp16=True
17
+ training.output_dir=$AMLT_OUTPUT_DIR
18
+ training.output_log_dir=$AMLT_LOGS_DIR
19
+ model.cache_dir=/mnt/blob/weights/.model.cache/
20
+ training.save_strategy=steps
21
+ training.save_steps=5000
22
+ training.save_total_limit=3
23
+ training.optim=adamw_torch
24
+ training.evaluate_before_train=True
25
+ training.per_device_train_batch_size=1
26
+ training.evaluation_strategy=steps
27
+ training.eval_steps=5000
28
+ training.logging_steps=1000
29
+ training.logging_first_step=True
30
+ training.dataloader_num_workers=4
31
+ training.num_masks_per_sample=16
32
+ wandb.project=$AMLT_EXPERIMENT_NAME
33
+ wandb.name=$AMLT_JOB_NAME
34
+ model.num_caption_tokens=8
35
+ model.additional_num_hidden_layers=12
36
+ model.num_task_tokens=6
37
+ training.lr_scheduler_type=cosine
38
+ model.lm_head_model_name_or_path=gpt2-large
39
+ training.learning_rate=1e-4
40
+ training.weight_decay=1e-4
41
+ training.warmup_steps=200
42
+ training.warmup_ratio=0.33333333
43
+ training.compute_metrics=True
44
+
45
+
46
+
47
+ environment:
48
+
49
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
50
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
51
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
52
+ registry: nvcr.io
53
+
54
+ code:
55
+ local_dir: $CONFIG_DIR/../
56
+
57
+
58
+
59
+ jobs:
60
+ - name: only-vg
61
+ preemptible: True
62
+ sku: ${NUM_NODES}xG${NUM_GPUS}
63
+ process_count_per_node: 1 # Each node should run 1 process
64
+ command:
65
+ - . amlt_configs/setup.sh
66
+ - source ~/.bashrc
67
+ - . amlt_configs/setup_accelerate_on_azure.sh
68
+ - >-
69
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
70
+ $SHARED_CMD_ARGS
71
+ train_data='[vg-densecap-local]'
72
+ eval_data='[vg-densecap-local]'
73
+ training.max_steps=100000
74
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
75
+ wandb.name=$$AMLT_JOB_NAME-vg
76
+ $EXTRA_ARGS
77
+ submit_args:
78
+ env:
79
+ SHARED_MEMORY_PERCENT: 0.5
80
+ HYDRA_FULL_ERROR: 1
81
+ # NCCL_IB_DISABLE: 1
82
+ # NCCL_IBEXT_DISABLE: 1
83
+ container_args:
84
+ shm_size: 256g
85
+
86
+ - name: first-coco-then-vg
87
+ preemptible: True
88
+ sku: ${NUM_NODES}xG${NUM_GPUS}
89
+ process_count_per_node: 1 # Each node should run 1 process
90
+ command:
91
+ - . amlt_configs/setup.sh
92
+ - source ~/.bashrc
93
+ - . amlt_configs/setup_accelerate_on_azure.sh
94
+ - >-
95
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
96
+ $SHARED_CMD_ARGS
97
+ train_data='[coco-instance-task_type_caption-local]'
98
+ eval_data='[coco-instance-task_type_caption-local]'
99
+ training.max_steps=100000
100
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
101
+ wandb.name=$$AMLT_JOB_NAME-coco
102
+ $EXTRA_ARGS
103
+ - >-
104
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
105
+ $SHARED_CMD_ARGS
106
+ train_data='[vg-densecap-local]'
107
+ eval_data='[vg-densecap-local]'
108
+ training.max_steps=100000
109
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
110
+ wandb.name=$$AMLT_JOB_NAME-vg
111
+ model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
112
+ $EXTRA_ARGS
113
+ submit_args:
114
+ env:
115
+ SHARED_MEMORY_PERCENT: 0.5
116
+ HYDRA_FULL_ERROR: 1
117
+ # NCCL_IB_DISABLE: 1
118
+ # NCCL_IBEXT_DISABLE: 1
119
+ container_args:
120
+ shm_size: 256g
121
+
122
+ - name: first-v3det-task_type_caption-local-then-vg
123
+ preemptible: True
124
+ sku: ${NUM_NODES}xG${NUM_GPUS}
125
+ process_count_per_node: 1 # Each node should run 1 process
126
+ command:
127
+ - . amlt_configs/setup.sh
128
+ - source ~/.bashrc
129
+ - . amlt_configs/setup_accelerate_on_azure.sh
130
+ - >-
131
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
132
+ $SHARED_CMD_ARGS
133
+ $EXTRA_ARGS
134
+ train_data='[v3det-task_type_caption-local]'
135
+ eval_data='[coco-instance-task_type_caption-local]'
136
+ training.max_steps=100000
137
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
138
+ wandb.name=$$AMLT_JOB_NAME-v3det
139
+ - >-
140
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
141
+ $SHARED_CMD_ARGS
142
+ train_data='[vg-densecap-local]'
143
+ eval_data='[vg-densecap-local]'
144
+ training.max_steps=100000
145
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
146
+ wandb.name=$$AMLT_JOB_NAME-vg
147
+ model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
148
+ $EXTRA_ARGS
149
+ submit_args:
150
+ env:
151
+ SHARED_MEMORY_PERCENT: 0.5
152
+ HYDRA_FULL_ERROR: 1
153
+ # NCCL_IB_DISABLE: 1
154
+ # NCCL_IBEXT_DISABLE: 1
155
+ container_args:
156
+ shm_size: 256g
157
+
158
+ - name: first-objects365-then-vg
159
+ preemptible: True
160
+ sku: ${NUM_NODES}xG${NUM_GPUS}
161
+ process_count_per_node: 1 # Each node should run 1 process
162
+ command:
163
+ - . amlt_configs/setup.sh
164
+ - source ~/.bashrc
165
+ - . amlt_configs/setup_accelerate_on_azure.sh
166
+ - >-
167
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
168
+ $SHARED_CMD_ARGS
169
+ train_data='[objects365-task_type_caption-local]'
170
+ eval_data='[coco-instance-task_type_caption-local]'
171
+ training.max_steps=100000
172
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
173
+ wandb.name=$$AMLT_JOB_NAME-objects365
174
+ $EXTRA_ARGS
175
+ - >-
176
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
177
+ $SHARED_CMD_ARGS
178
+ train_data='[vg-densecap-local]'
179
+ eval_data='[vg-densecap-local]'
180
+ training.max_steps=100000
181
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
182
+ wandb.name=$$AMLT_JOB_NAME-vg
183
+ model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
184
+ $EXTRA_ARGS
185
+ submit_args:
186
+ env:
187
+ SHARED_MEMORY_PERCENT: 0.5
188
+ HYDRA_FULL_ERROR: 1
189
+ # NCCL_IB_DISABLE: 1
190
+ # NCCL_IBEXT_DISABLE: 1
191
+ container_args:
192
+ shm_size: 256g
193
+
194
+
195
+ - name: first-coco-v3det-task_type_caption-local-then-vg
196
+ preemptible: True
197
+ sku: ${NUM_NODES}xG${NUM_GPUS}
198
+ process_count_per_node: 1 # Each node should run 1 process
199
+ command:
200
+ - . amlt_configs/setup.sh
201
+ - source ~/.bashrc
202
+ - . amlt_configs/setup_accelerate_on_azure.sh
203
+ - >-
204
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
205
+ $SHARED_CMD_ARGS
206
+ $EXTRA_ARGS
207
+ train_data='[coco-instance-task_type_caption-local,v3det-task_type_caption-local]'
208
+ train_data_interleave_probabilities='[117266,183348]'
209
+ eval_data='[coco-instance-task_type_caption-local]'
210
+ training.max_steps=100000
211
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
212
+ wandb.name=$$AMLT_JOB_NAME-v3det
213
+ - >-
214
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
215
+ $SHARED_CMD_ARGS
216
+ train_data='[vg-densecap-local]'
217
+ eval_data='[vg-densecap-local]'
218
+ training.max_steps=100000
219
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
220
+ wandb.name=$$AMLT_JOB_NAME-vg
221
+ model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
222
+ $EXTRA_ARGS
223
+ submit_args:
224
+ env:
225
+ SHARED_MEMORY_PERCENT: 0.5
226
+ HYDRA_FULL_ERROR: 1
227
+ # NCCL_IB_DISABLE: 1
228
+ # NCCL_IBEXT_DISABLE: 1
229
+ container_args:
230
+ shm_size: 256g
231
+
232
+ - name: first-coco-v3det-objects365-then-vg
233
+ preemptible: True
234
+ sku: ${NUM_NODES}xG${NUM_GPUS}
235
+ process_count_per_node: 1 # Each node should run 1 process
236
+ command:
237
+ - . amlt_configs/setup.sh
238
+ - source ~/.bashrc
239
+ - . amlt_configs/setup_accelerate_on_azure.sh
240
+ - >-
241
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
242
+ $SHARED_CMD_ARGS
243
+ $EXTRA_ARGS
244
+ train_data='[coco-instance-task_type_caption-local,v3det-task_type_caption-local,objects365-task_type_caption-local]'
245
+ train_data_interleave_probabilities='[117266,183348,1742289]'
246
+ eval_data='[coco-instance-task_type_caption-local]'
247
+ training.max_steps=100000
248
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
249
+ wandb.name=$$AMLT_JOB_NAME-v3det
250
+ - >-
251
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
252
+ $SHARED_CMD_ARGS
253
+ train_data='[vg-densecap-local]'
254
+ eval_data='[vg-densecap-local]'
255
+ training.max_steps=100000
256
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
257
+ wandb.name=$$AMLT_JOB_NAME-vg
258
+ model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
259
+ $EXTRA_ARGS
260
+ submit_args:
261
+ env:
262
+ SHARED_MEMORY_PERCENT: 0.5
263
+ HYDRA_FULL_ERROR: 1
264
+ # NCCL_IB_DISABLE: 1
265
+ # NCCL_IBEXT_DISABLE: 1
266
+ container_args:
267
+ shm_size: 256g
268
+
269
+ - name: first-coco-objects365-then-vg
270
+ preemptible: True
271
+ sku: ${NUM_NODES}xG${NUM_GPUS}
272
+ process_count_per_node: 1 # Each node should run 1 process
273
+ command:
274
+ - . amlt_configs/setup.sh
275
+ - source ~/.bashrc
276
+ - . amlt_configs/setup_accelerate_on_azure.sh
277
+ - >-
278
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
279
+ $SHARED_CMD_ARGS
280
+ $EXTRA_ARGS
281
+ train_data='[coco-instance-task_type_caption-local,objects365-task_type_caption-local]'
282
+ train_data_interleave_probabilities='[117266,1742289]'
283
+ eval_data='[coco-instance-task_type_caption-local]'
284
+ training.max_steps=100000
285
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
286
+ wandb.name=$$AMLT_JOB_NAME-v3det
287
+ - >-
288
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
289
+ $SHARED_CMD_ARGS
290
+ train_data='[vg-densecap-local]'
291
+ eval_data='[vg-densecap-local]'
292
+ training.max_steps=100000
293
+ training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
294
+ wandb.name=$$AMLT_JOB_NAME-vg
295
+ model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
296
+ $EXTRA_ARGS
297
+ submit_args:
298
+ env:
299
+ SHARED_MEMORY_PERCENT: 0.5
300
+ HYDRA_FULL_ERROR: 1
301
+ # NCCL_IB_DISABLE: 1
302
+ # NCCL_IBEXT_DISABLE: 1
303
+ container_args:
304
+ shm_size: 256g
305
+
306
+
307
+ # sing clusters, both octo and resrch failed
308
+ # amlt run -d "" \
309
+ # -t msroctovc -w msroctows --no-pre \
310
+ # amlt_configs/train-sca-ablat-weak_sup_data.yaml \
311
+ # 112123.train-sca-ablat-weak_sup_data.octo
312
+
313
+ # sing clusters, both octo and resrch failed
314
+ # amlt run -d "" \
315
+ # -t msrresrchvc -w msrresrchws --no-pre \
316
+ # amlt_configs/train-sca-ablat-weak_sup_data.yaml \
317
+ # 112123.train-sca-ablat-weak_sup_data.resrch
318
+
319
+ # amlt run -d "" \
320
+ # -t itplabrr1cl1 -w resrchvc --no-pre \
321
+ # amlt_configs/train-sca-ablat-weak_sup_data.yaml \
322
+ # 112123.train-sca-ablat-weak_sup_data.rr1
323
+
324
+ # amlt run -d "" \
325
+ # -t msroctovc -w msroctows --no-pre \
326
+ # amlt_configs/train-sca-ablat-weak_sup_data.yaml :first-coco-objects365-then-vg \
327
+ # 112123.train-sca-ablat-weak_sup_data.rr1
amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: this kind of string leaded by > will append a new line to the end of the string
5
+ SHARED_CMD_ARGS: >-
6
+ -m src.train
7
+ +model=base_sca_multitask_v2
8
+ training.do_train=True
9
+ training.do_eval=True
10
+ training.do_inference=True
11
+ +data.streaming=False
12
+ training.max_eval_samples=800
13
+ training.max_steps=200000
14
+ training.fp16=True
15
+ training.output_dir=$AMLT_OUTPUT_DIR
16
+ training.output_log_dir=$AMLT_LOGS_DIR
17
+ model.cache_dir=/mnt/blob/weights/.model.cache/
18
+ training.save_strategy=steps
19
+ training.save_steps=5000
20
+ training.save_total_limit=3
21
+ training.optim=adamw_torch
22
+ training.evaluate_before_train=True
23
+ training.per_device_train_batch_size=1
24
+ training.evaluation_strategy=steps
25
+ training.eval_steps=5000
26
+ training.logging_steps=1000
27
+ training.logging_first_step=True
28
+ training.dataloader_num_workers=4
29
+ training.num_masks_per_sample=16
30
+ wandb.project=$AMLT_EXPERIMENT_NAME
31
+ wandb.name=$AMLT_JOB_NAME
32
+ model.num_caption_tokens=8
33
+ model.additional_num_hidden_layers=12
34
+ model.num_task_tokens=6
35
+ training.lr_scheduler_type=cosine
36
+ model.lm_head_model_name_or_path=gpt2-large
37
+ training.learning_rate=1e-4
38
+ training.weight_decay=1e-4
39
+ training.warmup_steps=200
40
+ training.warmup_ratio=0.33333333
41
+ training.compute_metrics=True
42
+
43
+
44
+
45
+ environment:
46
+
47
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot initialized successfully
48
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot initialized successfully
49
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
50
+ registry: nvcr.io
51
+
52
+ code:
53
+ local_dir: $CONFIG_DIR/../
54
+
55
+
56
+
57
+ jobs:
58
+ - name: gpt2-large
59
+ preemptible: True
60
+ sku: ${NUM_NODES}xG${NUM_GPUS}
61
+ process_count_per_node: 1 # Each node should run 1 process
62
+ command:
63
+ - . amlt_configs/setup.sh
64
+ - source ~/.bashrc
65
+ - . amlt_configs/setup_accelerate_on_azure.sh
66
+ - >-
67
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
68
+ $SHARED_CMD_ARGS
69
+ train_data='[objects365-task_type_caption-local]'
70
+ eval_data='[objects365-task_type_caption-local]'
71
+ model.lm_head_model_name_or_path=gpt2-large
72
+ $EXTRA_ARGS
73
+
74
+ submit_args:
75
+ env:
76
+ SHARED_MEMORY_PERCENT: 0.5
77
+ HYDRA_FULL_ERROR: 1
78
+ # NCCL_IB_DISABLE: 1
79
+ # NCCL_IBEXT_DISABLE: 1
80
+ container_args:
81
+ shm_size: 256g
82
+
83
+ - name: open_llama_3b_v2
84
+ preemptible: True
85
+ sku: ${NUM_NODES}xG${NUM_GPUS}
86
+ process_count_per_node: 1 # Each node should run 1 process
87
+ command:
88
+ - . amlt_configs/setup.sh
89
+ - source ~/.bashrc
90
+ - . amlt_configs/setup_accelerate_on_azure.sh
91
+ - >-
92
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
93
+ $SHARED_CMD_ARGS
94
+ train_data='[objects365-task_type_caption-local]'
95
+ eval_data='[objects365-task_type_caption-local]'
96
+ model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
97
+ training.gradient_checkpointing=true
98
+ $EXTRA_ARGS
99
+
100
+ submit_args:
101
+ env:
102
+ SHARED_MEMORY_PERCENT: 0.5
103
+ HYDRA_FULL_ERROR: 1
104
+ # NCCL_IB_DISABLE: 1
105
+ # NCCL_IBEXT_DISABLE: 1
106
+ container_args:
107
+ shm_size: 256g
108
+
109
+
110
+
111
+
112
+ # sing octo 4x8 no-pre lsj
113
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml :1=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr train-sca-pretrain-o365-lsj-scale_lr-110923
114
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml :0=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr train-sca-pretrain-o365-lsj-scale_lr-110923
115
+
116
+ # amlt run -d "" --extra-args "training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml :1=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.ollm3bv2-large-no_lsj-1xlr train-sca-pretrain-o365-lsj-scale_lr-110923
117
+ # amlt run -d "" --extra-args "training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml :0=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.gpt2-large-no_lsj-1xlr train-sca-pretrain-o365-lsj-scale_lr-110923
118
+
119
+
120
+ # 4x8(x2)
121
+ # amlt run -d "" --extra-args "training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
122
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
123
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
124
+ # :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.ollm3bv2-large-no_lsj-1xlr-bs_2 \
125
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
126
+
127
+ # amlt run -d "" --extra-args "training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
128
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
129
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
130
+ # :0=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.gpt2-large-no_lsj-1xlr-bs_2 \
131
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
132
+
133
+
134
+ # 4x8(x2), 1e-4
135
+ # amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2" \
136
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
137
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
138
+ # :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.ollm3bv2-large-no_lsj-0xlr-bs_64 \
139
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
140
+
141
+ # resume above due to node 4006 failed
142
+ # amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2 training.resume_from_checkpoint=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7299942105.68600-95f56dfa-4b13-45bc-8d03-aad354819319/checkpoint-45000" \
143
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
144
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
145
+ # :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.ollm3bv2-large-no_lsj-0xlr-bs_64.resume \
146
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
147
+
148
+ # amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2 /mnt/blob/projects/sca-xiaoke-v3/amlt-results/7299935921.15305-a115d837-dada-4074-b41d-f66e1b187cc1/checkpoint-60000" \
149
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
150
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
151
+ # :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.ollm3bv2-large-no_lsj-0xlr-bs_64.resume.2 \
152
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
153
+
154
+ # amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2" \
155
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
156
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
157
+ # :0=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.gpt2-large-no_lsj-0xlr-bs_64 \
158
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
159
+
160
+ # amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=1" \
161
+ # -t msroctovc -w msroctows --sku=8xG8-V100 --no-pre \
162
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
163
+ # :0=`date +"%m%d%y"`.octo-8x8-v100-32g-pre.gpt2-large-no_lsj-0xlr-bs_64 \
164
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
165
+
166
+
167
+ # resume above due to node 4006 failed
168
+ # amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2" \
169
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
170
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
171
+ # :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre-no_ib.ollm3bv2-large-no_lsj-0xlr-bs_64.rerun \
172
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
173
+
174
+ # amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2" \
175
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
176
+ # amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
177
+ # :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre-_ib.ollm3bv2-large-no_lsj-0xlr-bs_64.rerun \
178
+ # train-sca-pretrain-o365-lsj-scale_lr-110923
amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ env_defaults:
2
+
3
+
4
+ # NOTE: this kind of string leaded by > will append a new line to the end of the string
5
+ SHARED_CMD_ARGS: >-
6
+ -m src.train
7
+ +model=base_sca_multitask_v2
8
+ training.do_train=True
9
+ training.do_eval=True
10
+ training.do_inference=True
11
+ +data.streaming=False
12
+ training.max_eval_samples=800
13
+ training.max_steps=100000
14
+ training.fp16=True
15
+ training.output_dir=$AMLT_OUTPUT_DIR
16
+ training.output_log_dir=$AMLT_LOGS_DIR
17
+ model.cache_dir=/mnt/blob/weights/.model.cache/
18
+ training.save_strategy=steps
19
+ training.save_steps=5000
20
+ training.save_total_limit=3
21
+ training.optim=adamw_torch
22
+ training.evaluate_before_train=True
23
+ training.per_device_train_batch_size=1
24
+ training.evaluation_strategy=steps
25
+ training.eval_steps=5000
26
+ training.logging_steps=1000
27
+ training.logging_first_step=True
28
+ training.dataloader_num_workers=4
29
+ training.num_masks_per_sample=16
30
+ wandb.project=$AMLT_EXPERIMENT_NAME
31
+ wandb.name=$AMLT_JOB_NAME
32
+ model.num_caption_tokens=8
33
+ model.additional_num_hidden_layers=12
34
+ model.num_task_tokens=6
35
+ training.lr_scheduler_type=cosine
36
+ model.lm_head_model_name_or_path=gpt2-large
37
+ training.learning_rate=1e-4
38
+ training.weight_decay=1e-4
39
+ training.warmup_steps=200
40
+ training.warmup_ratio=0.33333333
41
+ training.compute_metrics=True
42
+
43
+
44
+
45
+ environment:
46
+
47
+ image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot initialized successfully
48
+ # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot initialized successfully
49
+ # image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
50
+ registry: nvcr.io
51
+
52
+ code:
53
+ local_dir: $CONFIG_DIR/../
54
+
55
+
56
+
57
+ jobs:
58
+ - name: gpt2-large
59
+ preemptible: True
60
+ sku: ${NUM_NODES}xG${NUM_GPUS}-V100-IB
61
+ process_count_per_node: 1 # Each node should run 1 process
62
+ command:
63
+ - . amlt_configs/setup.sh
64
+ - source ~/.bashrc
65
+ - . amlt_configs/setup_accelerate_on_azure.sh
66
+ - >-
67
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
68
+ $SHARED_CMD_ARGS
69
+ train_data='[vg-densecap-region_descriptions]'
70
+ eval_data='[vg-densecap-region_descriptions]'
71
+ model.lm_head_model_name_or_path=gpt2-large
72
+ $EXTRA_ARGS
73
+
74
+
75
+ submit_args:
76
+ env:
77
+ SHARED_MEMORY_PERCENT: 0.5
78
+ HYDRA_FULL_ERROR: 1
79
+ # NCCL_IB_DISABLE: 1
80
+ # NCCL_IBEXT_DISABLE: 1
81
+ container_args:
82
+ shm_size: 256g
83
+
84
+ - name: open_llama_3b_v2
85
+ preemptible: True
86
+ sku: ${NUM_NODES}xG${NUM_GPUS}-V100-IB
87
+ process_count_per_node: 1 # Each node should run 1 process
88
+ command:
89
+ - . amlt_configs/setup.sh
90
+ - source ~/.bashrc
91
+ - . amlt_configs/setup_accelerate_on_azure.sh
92
+ - >-
93
+ accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
94
+ $SHARED_CMD_ARGS
95
+ train_data='[vg-densecap-region_descriptions]'
96
+ eval_data='[vg-densecap-region_descriptions]'
97
+ model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
98
+ training.gradient_checkpointing=true
99
+ $EXTRA_ARGS
100
+
101
+
102
+ submit_args:
103
+ env:
104
+ SHARED_MEMORY_PERCENT: 0.5
105
+ HYDRA_FULL_ERROR: 1
106
+ # NCCL_IB_DISABLE: 1
107
+ # NCCL_IBEXT_DISABLE: 1
108
+ container_args:
109
+ shm_size: 256g
110
+
111
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr/checkpoint-100000/ training.max_steps=100000 training.learning_rate=4e-4" \
112
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
113
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
114
+
115
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4" \
116
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
117
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :1=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.fintune-ollmv2-lr_1e_4-1xlr-lsj train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
118
+
119
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr/checkpoint-100000/ training.max_steps=100000 training.learning_rate=4e-4" \
120
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --pre \
121
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.resrch-4x8-v100-16g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
122
+
123
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4" \
124
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --pre \
125
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :1=`date +"%m%d%y"`.resrch-4x8-v100-16g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
126
+
127
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr/checkpoint-100000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
128
+ # -t itphcrdellcl1 --vc hcrdell1 --sku=5xG4-V100 --no-pre \
129
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.dell-5x4-v100-32g-no_pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_2 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
130
+
131
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
132
+ # -t itphcrdellcl1 --vc hcrdell1 --sku=5xG4-V100 --no-pre \
133
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :1=`date +"%m%d%y"`.dell-5x4-v100-32g-no_pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_2 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
134
+
135
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr/checkpoint-100000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
136
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
137
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_2 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
138
+
139
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
140
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
141
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :1=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_2 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
142
+
143
+
144
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
145
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
146
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_2-o365_1e_4_no_lsj_bs_64 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
147
+
148
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
149
+ # -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
150
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_2-o365_1e_4_no_lsj_bs_64 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
151
+
152
+
153
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111323.backup_ckpts.pretrain-o365-no_lsj-bs_64/train-sca-pretrain-o365-lsj-scale_lr-110923/111223.rr1-4x8-v100-32g-pre.gpt2-large-no_lsj-1xlr-bs_2/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
154
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
155
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
156
+
157
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111323.backup_ckpts.pretrain-o365-no_lsj-bs_64/train-sca-pretrain-o365-lsj-scale_lr-110923/111223.rr1-4x8-v100-32g-pre.ollm3bv2-large-no_lsj-1xlr-bs_2/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
158
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
159
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :1=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
160
+
161
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228594.52312-e52ba12d-9e32-41d4-9630-e8c5d3e47ca0/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
162
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
163
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
164
+ # :1=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k \
165
+ # train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
166
+
167
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228580.84789-0b1216d8-79dc-46b3-8ef2-57c112e1bd18/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
168
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
169
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
170
+ # :0=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k \
171
+ # train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
172
+
173
+ # The o365 ollm3bv2 failed due to devices. try different clusters
174
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228594.52312-e52ba12d-9e32-41d4-9630-e8c5d3e47ca0/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=1" \
175
+ # -t msrresrchvc -w msrresrchws --sku=8xG4-V100-IB --pre \
176
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
177
+ # :1=`date +"%m%d%y"`.resrch-8x4-v100-16g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k \
178
+ # train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
179
+
180
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228594.52312-e52ba12d-9e32-41d4-9630-e8c5d3e47ca0/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=1" \
181
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
182
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
183
+ # :1=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k \
184
+ # train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
185
+
186
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228594.52312-e52ba12d-9e32-41d4-9630-e8c5d3e47ca0/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=1" \
187
+ # -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --pre \
188
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
189
+ # :1=`date +"%m%d%y"`.resrch-4x8-v100-16g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k \
190
+ # train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
191
+
192
+ # amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228594.52312-e52ba12d-9e32-41d4-9630-e8c5d3e47ca0/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
193
+ # -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
194
+ # amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
195
+ # :1=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k.2 \
196
+ # train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
data/demo_cases/1000_IM-0003-1001.dcm.png ADDED

Git LFS Details

  • SHA256: ec5a66237445d21c019acdf3c47360f3b57f982087a5b6cf46b9822ba3ec6a02
  • Pointer size: 131 Bytes
  • Size of remote file: 273 kB
data/demo_cases/1000_IM-0003-2001.dcm.png ADDED

Git LFS Details

  • SHA256: d60af61d43bf3950eb86f526e542db472f75f84d1729412110f3a832c66940d3
  • Pointer size: 131 Bytes
  • Size of remote file: 249 kB
data/demo_cases/1000_IM-0003-3001.dcm.png ADDED

Git LFS Details

  • SHA256: e568e41c38c1a5b4f825c5bc4edc59b1f336659d471479e4aa3810fe61697c70
  • Pointer size: 131 Bytes
  • Size of remote file: 261 kB
data/demo_cases/1001_IM-0004-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 0140a86fe5f25863595d55c188aeb523b080ac2e3156c19c67c769b7e8c2d856
  • Pointer size: 131 Bytes
  • Size of remote file: 267 kB
data/demo_cases/1001_IM-0004-1002.dcm.png ADDED

Git LFS Details

  • SHA256: 2197f2da9ff3e632b5b1278e966305ac78858830464d4622ec873962a2f8d9d3
  • Pointer size: 131 Bytes
  • Size of remote file: 202 kB
data/demo_cases/1002_IM-0004-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 034b9ad401959d441b204ef8f61dde0714094d66b05c48696d161a2f24645791
  • Pointer size: 131 Bytes
  • Size of remote file: 270 kB
data/demo_cases/1002_IM-0004-2001.dcm.png ADDED

Git LFS Details

  • SHA256: 9350a749f581acba7619427a55490bea5deb9fbf6292877a566d7f25e707f67b
  • Pointer size: 131 Bytes
  • Size of remote file: 254 kB
data/demo_cases/1003_IM-0005-2002.dcm.png ADDED

Git LFS Details

  • SHA256: b889b5eb47f8a1d2bdf99ccd7496b889e16878a99d7a37d967eec9d1559036c4
  • Pointer size: 131 Bytes
  • Size of remote file: 299 kB
data/demo_cases/1004_IM-0005-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 3bee7774395b09950e0252cdc6f798c813ed2662fa5d168f863e9a28593a0cf0
  • Pointer size: 131 Bytes
  • Size of remote file: 273 kB
data/demo_cases/1004_IM-0005-2001.dcm.png ADDED

Git LFS Details

  • SHA256: 6048551a2869954e9859327ce55b31fde3f052a9f80a263e06c5b81e0548eff2
  • Pointer size: 131 Bytes
  • Size of remote file: 238 kB
data/demo_cases/1005_IM-0006-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 6c18fcbe986c191bd7d0b89bcc75425446712c34ff1ab6757c4a0ee704261e6c
  • Pointer size: 131 Bytes
  • Size of remote file: 300 kB
data/demo_cases/1005_IM-0006-3003.dcm.png ADDED

Git LFS Details

  • SHA256: 921532343d4a71b62d342075b1f2126f372fd3481d9253133f557a7563f5b197
  • Pointer size: 131 Bytes
  • Size of remote file: 248 kB
data/demo_cases/1006_IM-0007-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 2d3b62233c23b5f2f1e39dce450226c8ea697031242ac3e76c8366738f127eb6
  • Pointer size: 131 Bytes
  • Size of remote file: 293 kB
data/demo_cases/1006_IM-0007-3003.dcm.png ADDED

Git LFS Details

  • SHA256: 9765e0e404dcd241be967842dd14b365304f516da278e977aa4cd2d5f148529a
  • Pointer size: 131 Bytes
  • Size of remote file: 273 kB
data/demo_cases/1007_IM-0008-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 46c42fd74bc706da37e656446eafee4b43ad7bbf033571889da782ca681454db
  • Pointer size: 131 Bytes
  • Size of remote file: 275 kB
data/demo_cases/1007_IM-0008-2001.dcm.png ADDED

Git LFS Details

  • SHA256: c89264ecab2b84c480a28d27e40ddd232fe7fe12942aa5782cd0fd166bd0793e
  • Pointer size: 131 Bytes
  • Size of remote file: 228 kB
data/demo_cases/1007_IM-0008-3001.dcm.png ADDED

Git LFS Details

  • SHA256: 995b6b44632e58676e7d0106647401463eb727eed9398ab525c85f0acc3cf0b3
  • Pointer size: 131 Bytes
  • Size of remote file: 272 kB
data/demo_cases/1008_IM-0009-2001.dcm.png ADDED

Git LFS Details

  • SHA256: 41421b8cf7ffa1d18ffd2ec82e44a776c8eb7336968109c3b2cde6c8756054c5
  • Pointer size: 131 Bytes
  • Size of remote file: 247 kB
data/demo_cases/1008_IM-0009-4004.dcm.png ADDED

Git LFS Details

  • SHA256: e94c68da790a105a555a4a66ef269c10d91901b832ec4542960024fc8a57050b
  • Pointer size: 131 Bytes
  • Size of remote file: 312 kB
data/demo_cases/1009_IM-0010-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 5f4bffcd4ea12a253ac44dad49f115142225949647700a0cd0e230558fcb8688
  • Pointer size: 131 Bytes
  • Size of remote file: 260 kB
data/demo_cases/1009_IM-0010-2001.dcm.png ADDED

Git LFS Details

  • SHA256: 291928749fc79dc00fe5a14cfe11ae23dae34a5e6c018d1b5bcb7d30b325981b
  • Pointer size: 131 Bytes
  • Size of remote file: 214 kB
data/demo_cases/100_IM-0002-1001.dcm.png ADDED

Git LFS Details

  • SHA256: 0e0e3afc6c1ddfdfa68b99e56fd63a9ab738beb2f53165844ea9c7d6cfd9d29d
  • Pointer size: 131 Bytes
  • Size of remote file: 270 kB