Commit · 002bd9b0
Parent(s): init

This view is limited to 50 files because it contains too many changes.
- .amltignore +17 -0
- .gitattributes +2 -0
- .gitignore +161 -0
- LICENSE +201 -0
- README.md +74 -0
- amlt_configs/accelerate_config.yaml +18 -0
- amlt_configs/accelerate_deepspeed_config.local.yaml +26 -0
- amlt_configs/accelerate_deepspeed_config.yaml +26 -0
- amlt_configs/debug-data_mount.yaml +52 -0
- amlt_configs/debug-sca.yaml +65 -0
- amlt_configs/debug.yaml +51 -0
- amlt_configs/infer-sam_captioner-region_chunkify-eval_suite.yaml +69 -0
- amlt_configs/infer-sca-eval_suite-ckpt.yaml +133 -0
- amlt_configs/infer-sca-eval_suite-coco_instance_task_type_caption-last_model.yaml +95 -0
- amlt_configs/infer-sca-eval_suite-vg-best_model.yaml +96 -0
- amlt_configs/infer-sca-eval_suite-vg-last_model.yaml +96 -0
- amlt_configs/post_process.sh +2 -0
- amlt_configs/setup.sh +144 -0
- amlt_configs/setup_accelerate_on_azure.sh +53 -0
- amlt_configs/setup_eval_suite.sh +28 -0
- amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml +126 -0
- amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml +128 -0
- amlt_configs/train-sca-ablat-model_arch-103123.yaml +112 -0
- amlt_configs/train-sca-ablat-sam_size-110423.yaml +108 -0
- amlt_configs/train-sca-ablat-timm.yaml +143 -0
- amlt_configs/train-sca-ablat-weak_sup_data.yaml +327 -0
- amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml +178 -0
- amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml +196 -0
- data/demo_cases/1000_IM-0003-1001.dcm.png +3 -0
- data/demo_cases/1000_IM-0003-2001.dcm.png +3 -0
- data/demo_cases/1000_IM-0003-3001.dcm.png +3 -0
- data/demo_cases/1001_IM-0004-1001.dcm.png +3 -0
- data/demo_cases/1001_IM-0004-1002.dcm.png +3 -0
- data/demo_cases/1002_IM-0004-1001.dcm.png +3 -0
- data/demo_cases/1002_IM-0004-2001.dcm.png +3 -0
- data/demo_cases/1003_IM-0005-2002.dcm.png +3 -0
- data/demo_cases/1004_IM-0005-1001.dcm.png +3 -0
- data/demo_cases/1004_IM-0005-2001.dcm.png +3 -0
- data/demo_cases/1005_IM-0006-1001.dcm.png +3 -0
- data/demo_cases/1005_IM-0006-3003.dcm.png +3 -0
- data/demo_cases/1006_IM-0007-1001.dcm.png +3 -0
- data/demo_cases/1006_IM-0007-3003.dcm.png +3 -0
- data/demo_cases/1007_IM-0008-1001.dcm.png +3 -0
- data/demo_cases/1007_IM-0008-2001.dcm.png +3 -0
- data/demo_cases/1007_IM-0008-3001.dcm.png +3 -0
- data/demo_cases/1008_IM-0009-2001.dcm.png +3 -0
- data/demo_cases/1008_IM-0009-4004.dcm.png +3 -0
- data/demo_cases/1009_IM-0010-1001.dcm.png +3 -0
- data/demo_cases/1009_IM-0010-2001.dcm.png +3 -0
- data/demo_cases/100_IM-0002-1001.dcm.png +3 -0
.amltignore
ADDED
@@ -0,0 +1,17 @@
+/exp*
+/tmp
+
+/.mypy_cache
+__pycache__/
+/build
+/data
+/results
+*.egg-info/
+scripts/examples/
+.amltconfig
+/amlt
+.*cache/
+wandb/
+
+build/
+*.egg-info/
.gitattributes
ADDED
@@ -0,0 +1,2 @@
+data/**/* filter=lfs diff=lfs merge=lfs -text
+Medical-SAM2/data/** filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,161 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+wandb/
+
+# VSCode
+.vscode/*
+# !.vscode/settings.json
+# !.vscode/tasks.json
+# !.vscode/launch.json
+# !.vscode/extensions.json
+
+# Hydra
+.hydra
+multirun.yaml
+.submitit
+
+# These should be symlinked.
+exp
+.*cache/
+/tmp
+
+# Download data manually.
+data/all_instances_82K.jsonl
+data/alpaca_data.json
+data/user_oriented_instructions.jsonl
+
+# Ignore amlt files
+.amltconfig
+/amlt
+
+# Ignore slurm files
+**/*slurm*/**
+*.slurm
LICENSE
ADDED
@@ -0,0 +1,201 @@
+Apache License
+Version 2.0, January 2004
+http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+"License" shall mean the terms and conditions for use, reproduction,
+and distribution as defined by Sections 1 through 9 of this document.
+
+"Licensor" shall mean the copyright owner or entity authorized by
+the copyright owner that is granting the License.
+
+"Legal Entity" shall mean the union of the acting entity and all
+other entities that control, are controlled by, or are under common
+control with that entity. For the purposes of this definition,
+"control" means (i) the power, direct or indirect, to cause the
+direction or management of such entity, whether by contract or
+otherwise, or (ii) ownership of fifty percent (50%) or more of the
+outstanding shares, or (iii) beneficial ownership of such entity.
+
+"You" (or "Your") shall mean an individual or Legal Entity
+exercising permissions granted by this License.
+
+"Source" form shall mean the preferred form for making modifications,
+including but not limited to software source code, documentation
+source, and configuration files.
+
+"Object" form shall mean any form resulting from mechanical
+transformation or translation of a Source form, including but
+not limited to compiled object code, generated documentation,
+and conversions to other media types.
+
+"Work" shall mean the work of authorship, whether in Source or
+Object form, made available under the License, as indicated by a
+copyright notice that is included in or attached to the work
+(an example is provided in the Appendix below).
+
+"Derivative Works" shall mean any work, whether in Source or Object
+form, that is based on (or derived from) the Work and for which the
+editorial revisions, annotations, elaborations, or other modifications
+represent, as a whole, an original work of authorship. For the purposes
+of this License, Derivative Works shall not include works that remain
+separable from, or merely link (or bind by name) to the interfaces of,
+the Work and Derivative Works thereof.
+
+"Contribution" shall mean any work of authorship, including
+the original version of the Work and any modifications or additions
+to that Work or Derivative Works thereof, that is intentionally
+submitted to Licensor for inclusion in the Work by the copyright owner
+or by an individual or Legal Entity authorized to submit on behalf of
+the copyright owner. For the purposes of this definition, "submitted"
+means any form of electronic, verbal, or written communication sent
+to the Licensor or its representatives, including but not limited to
+communication on electronic mailing lists, source code control systems,
+and issue tracking systems that are managed by, or on behalf of, the
+Licensor for the purpose of discussing and improving the Work, but
+excluding communication that is conspicuously marked or otherwise
+designated in writing by the copyright owner as "Not a Contribution."
+
+"Contributor" shall mean Licensor and any individual or Legal Entity
+on behalf of whom a Contribution has been received by Licensor and
+subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+this License, each Contributor hereby grants to You a perpetual,
+worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+copyright license to reproduce, prepare Derivative Works of,
+publicly display, publicly perform, sublicense, and distribute the
+Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+this License, each Contributor hereby grants to You a perpetual,
+worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+(except as stated in this section) patent license to make, have made,
+use, offer to sell, sell, import, and otherwise transfer the Work,
+where such license applies only to those patent claims licensable
+by such Contributor that are necessarily infringed by their
+Contribution(s) alone or by combination of their Contribution(s)
+with the Work to which such Contribution(s) was submitted. If You
+institute patent litigation against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that the Work
+or a Contribution incorporated within the Work constitutes direct
+or contributory patent infringement, then any patent licenses
+granted to You under this License for that Work shall terminate
+as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+Work or Derivative Works thereof in any medium, with or without
+modifications, and in Source or Object form, provided that You
+meet the following conditions:
+
+(a) You must give any other recipients of the Work or
+Derivative Works a copy of this License; and
+
+(b) You must cause any modified files to carry prominent notices
+stating that You changed the files; and
+
+(c) You must retain, in the Source form of any Derivative Works
+that You distribute, all copyright, patent, trademark, and
+attribution notices from the Source form of the Work,
+excluding those notices that do not pertain to any part of
+the Derivative Works; and
+
+(d) If the Work includes a "NOTICE" text file as part of its
+distribution, then any Derivative Works that You distribute must
+include a readable copy of the attribution notices contained
+within such NOTICE file, excluding those notices that do not
+pertain to any part of the Derivative Works, in at least one
+of the following places: within a NOTICE text file distributed
+as part of the Derivative Works; within the Source form or
+documentation, if provided along with the Derivative Works; or,
+within a display generated by the Derivative Works, if and
+wherever such third-party notices normally appear. The contents
+of the NOTICE file are for informational purposes only and
+do not modify the License. You may add Your own attribution
+notices within Derivative Works that You distribute, alongside
+or as an addendum to the NOTICE text from the Work, provided
+that such additional attribution notices cannot be construed
+as modifying the License.
+
+You may add Your own copyright statement to Your modifications and
+may provide additional or different license terms and conditions
+for use, reproduction, or distribution of Your modifications, or
+for any such Derivative Works as a whole, provided Your use,
+reproduction, and distribution of the Work otherwise complies with
+the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+any Contribution intentionally submitted for inclusion in the Work
+by You to the Licensor shall be under the terms and conditions of
+this License, without any additional terms or conditions.
+Notwithstanding the above, nothing herein shall supersede or modify
+the terms of any separate license agreement you may have executed
+with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+names, trademarks, service marks, or product names of the Licensor,
+except as required for reasonable and customary use in describing the
+origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+agreed to in writing, Licensor provides the Work (and each
+Contributor provides its Contributions) on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+implied, including, without limitation, any warranties or conditions
+of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+PARTICULAR PURPOSE. You are solely responsible for determining the
+appropriateness of using or redistributing the Work and assume any
+risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+whether in tort (including negligence), contract, or otherwise,
+unless required by applicable law (such as deliberate and grossly
+negligent acts) or agreed to in writing, shall any Contributor be
+liable to You for damages, including any direct, indirect, special,
+incidental, or consequential damages of any character arising as a
+result of this License or out of the use or inability to use the
+Work (including but not limited to damages for loss of goodwill,
+work stoppage, computer failure or malfunction, or any and all
+other commercial damages or losses), even if such Contributor
+has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+the Work or Derivative Works thereof, You may choose to offer,
+and charge a fee for, acceptance of support, warranty, indemnity,
+or other liability obligations and/or rights consistent with this
+License. However, in accepting such obligations, You may act only
+on Your own behalf and on Your sole responsibility, not on behalf
+of any other Contributor, and only if You agree to indemnify,
+defend, and hold each Contributor harmless for any liability
+incurred by, or claims asserted against, such Contributor by reason
+of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+To apply the Apache License to your work, attach the following
+boilerplate notice, with the fields enclosed by brackets "[]"
+replaced with your own identifying information. (Don't include
+the brackets!) The text should be enclosed in the appropriate
+comment syntax for the file format. We also recommend that a
+file or class name and description of purpose be included on the
+same "printed page" as the copyright notice for easier
+identification within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
README.md
ADDED
@@ -0,0 +1,74 @@
+# Segment and Caption Anything
+
+This repository contains the official implementation of "Segment and Caption Anything".
+
+[Project Page](https://xk-huang.github.io/segment-caption-anything), [Paper](https://arxiv.org/abs/2312.00869)
+
+
+
+tl;dr
+1. Despite the absence of semantic labels in the training data, SAM implies high-level semantics sufficient for captioning.
+2. SCA (b) is a lightweight augmentation of SAM (a) with the ability to generate regional captions.
+3. On top of the SAM architecture, we add a fixed pre-trained language model and an optimizable lightweight hybrid feature mixture whose training is cheap and scalable.
+
+<table>
+  <tr>
+    <td><img src="./docs/anything-mode-00.png.jpg" alt="anything-mode-00"></td>
+    <td><img src="./docs/anything-mode-03.png.jpg" alt="anything-mode-01"></td>
+  </tr>
+  <tr>
+    <td><img src="./docs/anything-mode-01.png.jpg" alt="anything-mode-02"></td>
+    <td><img src="./docs/anything-mode-02.png.jpg" alt="anything-mode-03"></td>
+  </tr>
+</table>
+
+News
+
+- [01/31/2024] Update the [paper](https://xk-huang.github.io/segment-caption-anything/files/segment-caption-anything.013124.pdf) and the [supp](https://xk-huang.github.io/segment-caption-anything/files/segment-caption-anything-supp.013124.pdf). Release code v0.0.2: bump transformers to 4.36.2, support mistral series, phi-2, zephyr; add experiments about SAM+Image Captioner+[V-CoT](https://github.com/ttengwang/Caption-Anything), and more.
+- [12/05/2023] Release paper, code v0.0.1, and project page!
+
+## Environment Preparation
+
+Please check [docs/ENV.md](docs/ENV.md).
+
+
+## Model Zoo
+
+Please check [docs/MODEL_ZOO.md](docs/MODEL_ZOO.md)
+
+
+## Gradio Demo
+
+Please check [docs/DEMO.md](docs/DEMO.md)
+
+
+## Running Training and Inference
+
+Please check [docs/USAGE.md](docs/USAGE.md).
+
+
+## Experiments and Evaluation
+
+Please check [docs/EVAL.md](docs/EVAL.md)
+
+## License
+
+The trained weights are licensed under the [Apache 2.0 license](https://github.com/xk-huang/segment-caption-anything/blob/1c810bfcfeb3b95cd4b1f502f8f30c46333d58b8/LICENSE).
+
+## Acknowledgement
+
+Deeply appreciate these wonderful open source projects: [transformers](https://github.com/huggingface/transformers), [accelerate](https://github.com/huggingface/accelerate), [deepspeed](https://github.com/microsoft/DeepSpeed), [detectron2](https://github.com/facebookresearch/detectron2), [hydra](https://github.com/facebookresearch/hydra), [timm](https://github.com/huggingface/pytorch-image-models), [gradio](https://github.com/gradio-app/gradio).
+
+## Citation
+
+If you find this repository useful, please consider giving a star ⭐ and citation 🦖:
+
+```
+@misc{xiaoke2023SCA,
+  title={{Segment and Caption Anything}},
+  author={Xiaoke, Huang and Jianfeng, Wang and Yansong, Tang and Zheng, Zhang and Han, Hu and Jiwen, Lu and Lijuan, Wang and Zicheng, Liu},
+  journal={arXiv},
+  volume={abs/2312.00869},
+  year={2023},
+}
+```
amlt_configs/accelerate_config.yaml
ADDED
@@ -0,0 +1,18 @@
+# https://huggingface.co/docs/transformers/v4.32.1/en/main_classes/trainer#using-accelerate-launcher-with-trainer
+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: 0 # change this for each node
+main_process_ip: node-0 # the machines on Azure are inter-connected, so you can configure it according to `~/.ssh/config`
+main_process_port: 11451 # change this as you like
+main_training_function: main
+mixed_precision: fp16
+num_machines: 2 # change this for all nodes
+num_processes: 8 # change this for all nodes: the total number of GPU processes across all nodes
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
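As a usage sketch (not part of the commit): the job configs below consume this file via `accelerate launch --config_file ...`. Assuming the two-node, 8-process layout above, each node would run the same command, differing only in the machine rank; the `src.train` module and the Hydra-style overrides are taken from the debug configs in this commit, while the output directory here is a placeholder.

# hypothetical per-node launch; bump --machine_rank to 1 on the second node
accelerate launch --config_file amlt_configs/accelerate_config.yaml --machine_rank 0 \
  -m src.train +model=base_sca \
  train_data=[vg-densecap-region_descriptions] eval_data=[vg-densecap-region_descriptions] \
  training.fp16=True training.output_dir=./exp/debug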
amlt_configs/accelerate_deepspeed_config.local.yaml
ADDED
@@ -0,0 +1,26 @@
+# https://huggingface.co/docs/transformers/v4.32.1/en/main_classes/trainer#using-accelerate-launcher-with-trainer
+compute_environment: LOCAL_MACHINE
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  gradient_accumulation_steps: 1
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: false
+  zero_stage: 2
+  gradient_clipping: 1.0
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: 0 # change this for each node
+main_process_ip: localhost # the machines on Azure are inter-connected, so you can configure it according to `~/.ssh/config`
+main_process_port: 11451 # change this as you like
+main_training_function: main
+mixed_precision: fp16
+num_machines: 1 # change this for all nodes
+num_processes: 1 # change this for all nodes: the total number of GPU processes across all nodes
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
amlt_configs/accelerate_deepspeed_config.yaml
ADDED
@@ -0,0 +1,26 @@
+# https://huggingface.co/docs/transformers/v4.32.1/en/main_classes/trainer#using-accelerate-launcher-with-trainer
+compute_environment: LOCAL_MACHINE
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  gradient_accumulation_steps: 1
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: false
+  zero_stage: 2
+  gradient_clipping: 1.0
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: 0 # change this for each node
+main_process_ip: node-0 # the machines on Azure are inter-connected, so you can configure it according to `~/.ssh/config`
+main_process_port: 11451 # change this as you like
+main_training_function: main
+mixed_precision: fp16
+num_machines: 2 # change this for all nodes
+num_processes: 8 # change this for all nodes: the total number of GPU processes across all nodes
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
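The DeepSpeed variant is launched the same way; this mirrors the commented-out command in amlt_configs/debug-sca.yaml below. For quick local debugging, the `.local` variant above (num_machines: 1, num_processes: 1) can be substituted. The overrides after `-m src.train` follow debug-sca.yaml; `./exp/debug` is a placeholder output directory, not a path from this commit.

# minimal sketch of a ZeRO-2 training launch with the DeepSpeed config
accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.local.yaml \
  -m src.train +model=base_sca \
  train_data='[vg-densecap-region_descriptions]' eval_data='[vg-densecap-region_descriptions]' \
  training.do_train=True training.fp16=True training.output_dir=./exp/debug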
amlt_configs/debug-data_mount.yaml
ADDED
@@ -0,0 +1,52 @@
+env_defaults:
+
+  SHARED_CMD_ARGS: '
+    -m src.train
+    train_data=[vg-densecap-region_descriptions] eval_data=[vg-densecap-region_descriptions]
+    +model=base_sam_captioner
+    training.do_train=False
+    training.do_eval=False
+    training.do_inference=True
+    training.num_masks_per_sample=1
+    +data.streaming=False
+    training.max_eval_samples=10
+    training.max_train_samples=1
+    training.num_train_epochs=10
+    training.fp16=True
+    training.output_dir=$AMLT_OUTPUT_DIR
+    training.output_log_dir=$AMLT_LOGS_DIR
+    model.cache_dir=/mnt/blob/weights/.model.cache/
+    training.dataloader_num_workers=4
+  '
+
+
+environment:
+
+  image: nvidia/pytorch:23.07-py3
+  registry: nvcr.io
+
+code:
+  local_dir: $CONFIG_DIR/../
+
+
+
+jobs:
+- name: sam_captioner-infer-debug
+  sku: G$NUM_GPUS
+  preemptible: False
+  process_count_per_node: 1 # Each node should run 1 process
+  command:
+    - . amlt_configs/setup.sh
+    - source ~/.bashrc
+    - . amlt_configs/setup_accelerate_on_azure.sh
+    - . amlt_configs/post_process.sh
+    - accelerate launch --config_file amlt_configs/accelerate_config.yaml $SHARED_CMD_ARGS || . amlt_configs/post_process.sh
+    - . amlt_configs/post_process.sh
+
+
+submit_args:
+  env:
+    AZFUSE_USE_FUSE: "1"
+    SHARED_MEMORY_PERCENT: 0.5
+  container_args:
+    shm_size: 256g
amlt_configs/debug-sca.yaml
ADDED
@@ -0,0 +1,65 @@
+env_defaults:
+
+  SHARED_CMD_ARGS: >
+    -m src.train
+    train_data='[vg-densecap-region_descriptions]' eval_data='[vg-densecap-region_descriptions]'
+    +model=base_sca
+    training.do_train=True
+    training.do_eval=True
+    training.do_inference=True
+    +data.streaming=False
+    training.max_eval_samples=800
+    training.max_steps=200000
+    training.fp16=True
+    model.cache_dir=/mnt/blob/weights/.model.cache/
+    training.save_strategy=steps
+    training.save_steps=5000
+    training.save_total_limit=3
+    training.optim=adamw_torch
+    training.evaluate_before_train=True
+    training.per_device_train_batch_size=1
+    training.evaluation_strategy=steps
+    training.eval_steps=5000
+    training.logging_steps=1000
+    training.logging_first_step=True
+    training.lr_scheduler_type=constant
+    training.warmup_steps=2000
+    training.learning_rate=1e-4
+    model.lm_head_model_name_or_path=gpt2-large
+    training.dataloader_num_workers=4
+    training.num_masks_per_sample=8
+    model.num_caption_tokens=8
+    training.output_dir=$AMLT_OUTPUT_DIR
+    training.output_log_dir=$AMLT_LOGS_DIR
+    wandb.group=$AMLT_EXPERIMENT_NAME-$AMLT_DESCRIPTION
+    wandb.name=$AMLT_JOB_NAME
+
+
+environment:
+
+  image: nvidia/pytorch:23.07-py3
+  registry: nvcr.io
+
+code:
+  local_dir: $CONFIG_DIR/../
+
+
+
+jobs:
+- name: sca-debug
+  sku: G$NUM_GPUS
+  process_count_per_node: 1 # Each node should run 1 process
+  preemptible: False
+  command:
+    - . amlt_configs/setup.sh
+    - source ~/.bashrc
+    - . amlt_configs/setup_accelerate_on_azure.sh
+    - . amlt_configs/post_process.sh
+    # - accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml $SHARED_CMD_ARGS || . amlt_configs/post_process.sh
+
+submit_args:
+  env:
+    AZFUSE_USE_FUSE: "1"
+    SHARED_MEMORY_PERCENT: 0.5
+  container_args:
+    shm_size: 256g
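These job YAMLs are intended for submission with the amlt CLI. A hypothetical submission, modeled on the commented example at the bottom of infer-sca-eval_suite-ckpt.yaml further down (the target and workspace names there are site-specific and are placeholders here; the sku reuses the G4-V100 value from that example):

# sketch only: submit the debug training job to an AML target/workspace of your own
amlt run -t <target> -w <workspace> --sku G4-V100 \
  amlt_configs/debug-sca.yaml `date +"%m%d%y"`.sca-debug -y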
amlt_configs/debug.yaml
ADDED
@@ -0,0 +1,51 @@
+env_defaults:
+
+  SHARED_CMD_ARGS: '
+    -m src.train
+    train_data=[vg-densecap-region_descriptions] eval_data=[vg-densecap-region_descriptions]
+    +model=base_sam_captioner
+    training.do_train=False
+    training.do_eval=False
+    training.do_inference=True
+    training.num_masks_per_sample=1
+    +data.streaming=False
+    training.max_eval_samples=10
+    training.max_train_samples=1
+    training.num_train_epochs=10
+    training.fp16=True
+    training.output_dir=$AMLT_OUTPUT_DIR
+    training.output_log_dir=$AMLT_LOGS_DIR
+    model.cache_dir=/mnt/blob/weights/.model.cache/
+    training.dataloader_num_workers=4
+  '
+
+
+
+environment:
+  image: nvidia/pytorch:23.07-py3
+  registry: nvcr.io
+
+code:
+  local_dir: $CONFIG_DIR/../
+
+
+
+jobs:
+- name: sam_captioner-infer-debug
+  sku: G$NUM_GPUS
+  preemptible: False
+  process_count_per_node: 1 # Each node should run 1 process
+  command:
+    - . amlt_configs/setup.sh
+    - source ~/.bashrc
+    - . amlt_configs/setup_accelerate_on_azure.sh
+    - . amlt_configs/post_process.sh
+    # - accelerate launch --config_file amlt_configs/accelerate_config.yaml $SHARED_CMD_ARGS || . amlt_configs/post_process.sh
+
+
+submit_args:
+  env:
+    AZFUSE_USE_FUSE: "1"
+    SHARED_MEMORY_PERCENT: 0.5
+  container_args:
+    shm_size: 256g
amlt_configs/infer-sam_captioner-region_chunkify-eval_suite.yaml
ADDED
@@ -0,0 +1,69 @@
+env_defaults:
+
+  SHARED_CMD_ARGS: '
+    -m src.train
+    train_data=[vg-densecap-region_descriptions] eval_data=[vg-densecap-region_descriptions]
+    +model=base_sam_captioner
+    training.do_train=False
+    training.do_eval=False
+    training.do_inference=True
+    +data.streaming=False
+    training.fp16=True
+    training.output_dir=$AMLT_OUTPUT_DIR
+    training.output_log_dir=$AMLT_LOGS_DIR
+    model.cache_dir=/mnt/blob/weights/.model.cache/
+    training.dataloader_num_workers=4
+  '
+
+environment:
+
+  image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
+  registry: nvcr.io
+
+code:
+  local_dir: $CONFIG_DIR/../
+
+
+
+# Salesforce/blip-image-captioning-large, Salesforce/blip-image-captioning-base, microsoft/git-large-coco, microsoft/git-large-textcaps, microsoft/git-large, microsoft/git-base-coco, microsoft/git-base-textcaps, microsoft/git-base
+# LM_MODEL='Salesforce/blip-image-captioning-large' && amlt run config.yaml :Salesforce/blip-image-captioning-large=$LM_MODEL --extra-args "model.captioner_model_name_or_path=$LM_MODEL"
+jobs:
+- name: Salesforce/blip-image-captioning-large
+  sku: G$NUM_GPUS
+  preemptible: False
+  command:
+    - . amlt_configs/setup.sh
+    - source ~/.bashrc
+    - pip install pydantic==1.10.8 # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
+    - . amlt_configs/setup_eval_suite.sh
+    - . amlt_configs/setup_accelerate_on_azure.sh
+
+    # caption
+    - DATASET=vg-densecap-region_descriptions
+    - accelerate launch --num_processes $NUM_GPUS $SHARED_CMD_ARGS model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-vg-densecap-region_descriptions/region_img_annot_caption/visual_genome.py-region_descriptions_v1.2.0-test.region_img.tsv
+
+    - DATASET=refcoco-google
+    - accelerate launch --num_processes $NUM_GPUS $SHARED_CMD_ARGS model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-refcoco-google/region_img_annot_caption/refcoco.py-refcoco-google-test.region_img.tsv
+
+    # concept
+    - DATASET=coco-instance
+    - accelerate launch --num_processes $NUM_GPUS $SHARED_CMD_ARGS model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # OOM and very slow
+    # - DATASET=objects365-local
+    # - accelerate launch --num_processes $NUM_GPUS $SHARED_CMD_ARGS model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # OOM and very slow
+    # - DATASET=v3det-local
+    # - accelerate launch --num_processes $NUM_GPUS $SHARED_CMD_ARGS model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+submit_args:
+  env:
+    SHARED_MEMORY_PERCENT: 0.5
+  container_args:
+    shm_size: 256g
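A note on the variable syntax in these job commands, for readers unfamiliar with amlt configs: single-`$` references (`$SHARED_CMD_ARGS`, `$NUM_GPUS`, `$EXTRA_ARGS`) appear to be substituted by amlt from `env_defaults` and `--extra-args` at submission time, while `$$`-prefixed ones (`$$DATASET`, `$$AMLT_OUTPUT_DIR`) are escaped so the shell inside the job expands them at runtime. Stripping that escaping, the first inference-plus-evaluation step above reduces to roughly the following sketch (all paths and overrides come from the config above; nothing new is assumed beyond a shell with the repo set up):

# single-dataset inference + evaluation, as it runs inside the job container
DATASET=vg-densecap-region_descriptions
accelerate launch --num_processes $NUM_GPUS -m src.train +model=base_sam_captioner \
  model.captioner_model_name_or_path=Salesforce/blip-image-captioning-large \
  train_data=[$DATASET] eval_data=[$DATASET] \
  training.do_inference=True training.fp16=True \
  training.output_dir=$AMLT_OUTPUT_DIR/$DATASET
bash scripts/tools/eval_suite.sh $AMLT_OUTPUT_DIR/$DATASET infer.json inference \
  /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-vg-densecap-region_descriptions/region_img_annot_caption/visual_genome.py-region_descriptions_v1.2.0-test.region_img.tsv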
amlt_configs/infer-sca-eval_suite-ckpt.yaml
ADDED
@@ -0,0 +1,133 @@
+env_defaults:
+
+  SHARED_CMD_ARGS: >-
+    -m src.train
+    +model=base_sca
+    training.do_train=False
+    training.do_eval=False
+    training.do_inference=True
+    training.fp16=True
+    wandb.log=False
+    training.dataloader_num_workers=4
+    training.output_log_dir=$AMLT_LOGS_DIR
+    model.cache_dir=/mnt/blob/weights/.model.cache/
+
+
+
+
+
+environment:
+
+  image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
+  registry: nvcr.io
+
+code:
+  local_dir: $CONFIG_DIR/../
+
+
+
+jobs:
+- name: infer-eval_suite
+  sku: G$NUM_GPUS
+  preemptible: False
+  command:
+    - . amlt_configs/setup.sh
+    - source ~/.bashrc
+    - pip install pydantic==1.10.8 # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
+    - . amlt_configs/setup_eval_suite.sh
+    - . amlt_configs/setup_accelerate_on_azure.sh
+
+    # caption
+    - DATASET=vg-densecap-local
+    - >-
+      accelerate launch $SHARED_CMD_ARGS
+      train_data=[$$DATASET]
+      eval_data=[$$DATASET]
+      training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
+      $EXTRA_ARGS
+    - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    - DATASET=refcocog-google
+    - >-
+      accelerate launch $SHARED_CMD_ARGS
+      train_data=[$$DATASET]
+      eval_data=[$$DATASET]
+      training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
+      $EXTRA_ARGS
+    - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    - DATASET=refcoco-unc-split_testA
+    - >-
+      accelerate launch $SHARED_CMD_ARGS
+      train_data=[$$DATASET]
+      eval_data=[$$DATASET]
+      training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
+      $EXTRA_ARGS
+    - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    - DATASET=refcoco-unc-split_testB
+    - >-
+      accelerate launch $SHARED_CMD_ARGS
+      train_data=[$$DATASET]
+      eval_data=[$$DATASET]
+      training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
+      $EXTRA_ARGS
+    - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    - DATASET=refcoco+-unc-split_testA
+    - >-
+      accelerate launch $SHARED_CMD_ARGS
+      train_data=[$$DATASET]
+      eval_data=[$$DATASET]
+      training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
+      $EXTRA_ARGS
+    - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    - DATASET=refcoco+-unc-split_testB
+    - >-
+      accelerate launch $SHARED_CMD_ARGS
+      train_data=[$$DATASET]
+      eval_data=[$$DATASET]
+      training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
+      $EXTRA_ARGS
+    - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # concept
+    # - DATASET=coco-instance
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # OOM and very slow
+    # - DATASET=objects365-local
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # OOM and very slow
+    # - DATASET=v3det-local
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+submit_args:
+  env:
+    SHARED_MEMORY_PERCENT: 0.5
+  container_args:
+    shm_size: 256g
+
+# CKPT_PATHS=(
+#   /mnt/blob/weights/sca-weights.111823/finetune-gpt2_large-lr_1e_4-1xlr-lsj-bs_1-pretrain_1e_4_no_lsj_bs_32.111223.rr1-4x8-v100-32g-pre/checkpoint-100000
+#   /mnt/blob/weights/sca-weights.111823/gpt2-large-lsj-1xlr.110423.octo-4x8-v100-16g-no_pre/checkpoint-200000
+#   /mnt/blob/weights/sca-weights.111823/ollm3bv2-large-lsj-1xlr.110423.octo-4x8-v100-16g-no_pre/checkpoint-200000
+#   /mnt/blob/weights/sca-weights.111823/pretrain_1e_4_no_lsj_bs_32.110523.rr1-4x8-v100-32g-pre/checkpoint-100000
+# )
+# for CKPT_PATH in ${CKPT_PATHS[@]} ; do
+#   CKPT_NAME=$(basename $(dirname $CKPT_PATH))
+#   echo $CKPT_NAME
+#   amlt run \
+#     -d "" --extra-args "training.generation_num_beams=3 training.fp16_full_eval=True model.model_name_or_path=$CKPT_PATH model.lm_head_model_name_or_path=\$(python scripts/tools/get_sub_model_name_from_ckpt.py $CKPT_PATH lm) model.sam_model_name_or_path=facebook/sam-vit-huge" \
+#     -t msroctovc -w msroctows --no-pre \
+#     --sku G4-V100 \
+#     amlt_configs/infer-sca-eval_suite-ckpt.yaml \
+#     :0=$CKPT_NAME \
+#     `date +"%m%d%y"`.infer-ckpt-all_dataset \
+#     -y
+# done
amlt_configs/infer-sca-eval_suite-coco_instance_task_type_caption-last_model.yaml
ADDED
@@ -0,0 +1,95 @@
+env_defaults:
+
+  SHARED_CMD_ARGS: >-
+    -m src.train
+    +model=base_sca
+    training.do_train=False
+    training.do_eval=False
+    training.do_inference=True
+    training.fp16=True
+    training.output_log_dir=$AMLT_LOGS_DIR
+    model.cache_dir=/mnt/blob/weights/.model.cache/
+    wandb.log=False
+    training.dataloader_num_workers=4
+
+
+
+
+environment:
+
+  image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
+  registry: nvcr.io
+
+code:
+  local_dir: $CONFIG_DIR/../
+
+
+
+jobs:
+- name: infer-eval_suite
+  sku: G$NUM_GPUS
+  preemptible: False
+  command:
+    - . amlt_configs/setup.sh
+    - source ~/.bashrc
+    - pip install pydantic==1.10.8 # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
+    - . amlt_configs/setup_eval_suite.sh
+    - . amlt_configs/setup_accelerate_on_azure.sh
+
+    # get best (or max step) model
+    - BEST_CKPT_STEP=$$(python scripts/tools/get_model_name_from_trainer_state.py $$AMLT_MAP_INPUT_DIR "last")
+    - BEST_CKPT_PATH=$$(find $$AMLT_MAP_INPUT_DIR -name '*checkpoint*' | grep $$BEST_CKPT_STEP | tail -n1)
+
+    # caption
+    - DATASET=coco-instance-task_type_caption-local
+    - >-
+      accelerate launch $SHARED_CMD_ARGS
+      train_data=[$$DATASET]
+      eval_data=[$$DATASET]
+      training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
+      model.model_name_or_path=$$BEST_CKPT_PATH
+      model.lm_head_model_name_or_path=$(python scripts/tools/get_sub_model_name_from_ckpt.py $$BEST_CKPT_PATH "lm")
+      model.sam_model_name_or_path=facebook/sam-vit-huge
+      $EXTRA_ARGS
+    - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-coco-instance-local/region_img_annot_caption/coco_instance-local.py-2017-validation.region_img.tsv
+
+    # - DATASET=refcocog-google
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-refcocog-google/region_img_annot_caption/refcoco.py-refcocog-google-validation.region_img.tsv
+
+    # - DATASET=refcoco-unc-split_testA
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # - DATASET=refcoco-unc-split_testB
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # - DATASET=refcoco+-unc-split_testA
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # - DATASET=refcoco+-unc-split_testB
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # concept
+    # - DATASET=coco-instance
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # OOM and very slow
+    # - DATASET=objects365-local
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # OOM and very slow
+    # - DATASET=v3det-local
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+submit_args:
+  env:
+    SHARED_MEMORY_PERCENT: 0.5
+  container_args:
+    shm_size: 256g
amlt_configs/infer-sca-eval_suite-vg-best_model.yaml
ADDED
@@ -0,0 +1,96 @@
+env_defaults:
+
+  SHARED_CMD_ARGS: >-
+    -m src.train
+    +model=base_sca
+    training.do_train=False
+    training.do_eval=False
+    training.do_inference=True
+    training.fp16=True
+    training.output_log_dir=$AMLT_LOGS_DIR
+    model.cache_dir=/mnt/blob/weights/.model.cache/
+    wandb.log=False
+    training.dataloader_num_workers=4
+
+
+
+
+environment:
+
+  image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
+  registry: nvcr.io
+
+code:
+  local_dir: $CONFIG_DIR/../
+
+
+
+jobs:
+- name: infer-eval_suite
+  sku: G$NUM_GPUS
+  preemptible: False
+  command:
+    - . amlt_configs/setup.sh
+    - source ~/.bashrc
+    - pip install pydantic==1.10.8 # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
+    - . amlt_configs/setup_eval_suite.sh
+    - . amlt_configs/setup_accelerate_on_azure.sh
+
+    # get best (or max step) model
+    - BEST_CKPT_STEP=$$(python scripts/tools/get_model_name_from_trainer_state.py $$AMLT_MAP_INPUT_DIR "best")
+    - BEST_CKPT_PATH=$$(find $$AMLT_MAP_INPUT_DIR -name '*checkpoint*' | grep $$BEST_CKPT_STEP | tail -n1)
+
+    # caption
+    - DATASET=vg-densecap-region_descriptions
+    - >-
+      accelerate launch $SHARED_CMD_ARGS
+      train_data=[$$DATASET]
+      eval_data=[$$DATASET]
+      training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
+      model.model_name_or_path=$$BEST_CKPT_PATH
+      model.lm_head_model_name_or_path=$$(python scripts/tools/get_sub_model_name_from_ckpt.py $$BEST_CKPT_PATH "lm")
+      model.sam_model_name_or_path=facebook/sam-vit-huge
+      $EXTRA_ARGS
+
+    - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-vg-densecap-region_descriptions/region_img_annot_caption/visual_genome.py-region_descriptions_v1.2.0-test.region_img.tsv
+
+    # - DATASET=refcocog-google
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-refcocog-google/region_img_annot_caption/refcoco.py-refcocog-google-validation.region_img.tsv
+
+    # - DATASET=refcoco-unc-split_testA
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # - DATASET=refcoco-unc-split_testB
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # - DATASET=refcoco+-unc-split_testA
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # - DATASET=refcoco+-unc-split_testB
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # concept
+    # - DATASET=coco-instance
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # OOM and very slow
+    # - DATASET=objects365-local
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+    # OOM and very slow
+    # - DATASET=v3det-local
+    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
+    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference
+
+submit_args:
+  env:
+    SHARED_MEMORY_PERCENT: 0.5
+  container_args:
+    shm_size: 256g
amlt_configs/infer-sca-eval_suite-vg-last_model.yaml
ADDED
@@ -0,0 +1,96 @@
env_defaults:

  SHARED_CMD_ARGS: >-
    -m src.train
    +model=base_sca
    training.do_train=False
    training.do_eval=False
    training.do_inference=True
    training.fp16=True
    training.output_log_dir=$AMLT_LOGS_DIR
    model.cache_dir=/mnt/blob/weights/.model.cache/
    wandb.log=False
    training.dataloader_num_workers=4


environment:

  image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
  registry: nvcr.io

code:
  local_dir: $CONFIG_DIR/../


jobs:
- name: infer-eval_suite
  sku: G$NUM_GPUS
  preemptible: False
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - pip install pydantic==1.10.8 # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
    - . amlt_configs/setup_eval_suite.sh
    - . amlt_configs/setup_accelerate_on_azure.sh

    # get best (or max step) model
    - BEST_CKPT_STEP=$$(python scripts/tools/get_model_name_from_trainer_state.py $$AMLT_MAP_INPUT_DIR "last")
    - BEST_CKPT_PATH=$$(find $$AMLT_MAP_INPUT_DIR -name '*checkpoint*' | grep $$BEST_CKPT_STEP | tail -n1)

    # caption
    - DATASET=vg-densecap-region_descriptions
    - >-
      accelerate launch $SHARED_CMD_ARGS
      train_data=[$$DATASET]
      eval_data=[$$DATASET]
      training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
      model.model_name_or_path=$$BEST_CKPT_PATH
      model.lm_head_model_name_or_path=$$(python scripts/tools/get_sub_model_name_from_ckpt.py $$BEST_CKPT_PATH "lm")
      model.sam_model_name_or_path=facebook/sam-vit-huge
      $EXTRA_ARGS

    - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-vg-densecap-region_descriptions/region_img_annot_caption/visual_genome.py-region_descriptions_v1.2.0-test.region_img.tsv

    # - DATASET=refcocog-google
    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
    # - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-refcocog-google/region_img_annot_caption/refcoco.py-refcocog-google-validation.region_img.tsv

    # - DATASET=refcoco-unc-split_testA
    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference

    # - DATASET=refcoco-unc-split_testB
    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference

    # - DATASET=refcoco+-unc-split_testA
    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference

    # - DATASET=refcoco+-unc-split_testB
    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference

    # concept
    # - DATASET=coco-instance
    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference

    # OOM and very slow
    # - DATASET=objects365-local
    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference

    # OOM and very slow
    # - DATASET=v3det-local
    # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
    # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference

  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
    container_args:
      shm_size: 256g
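The two checkpoint-resolution lines in the job above can be exercised outside AMLT before submitting. A minimal sketch, assuming a results directory that holds trainer checkpoints such as `checkpoint-95000` (the path below is hypothetical, not part of this commit):

# minimal local dry run of the checkpoint-resolution step
RUN_DIR=/path/to/amlt-results/run   # hypothetical; on the cluster this is $AMLT_MAP_INPUT_DIR
CKPT_STEP=$(python scripts/tools/get_model_name_from_trainer_state.py "$RUN_DIR" "last")
CKPT_PATH=$(find "$RUN_DIR" -name '*checkpoint*' | grep "$CKPT_STEP" | tail -n1)
echo "resolved checkpoint: $CKPT_PATH"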
amlt_configs/post_process.sh
ADDED
@@ -0,0 +1,2 @@
echo "The main process failed, enter post_process.sh"
python -c "import time;time.sleep(100000)"
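post_process.sh simply keeps the container alive after a failure so the node can still be inspected. How it is wired into a job is not shown in this commit; a hypothetical use would be to chain it onto the main command:

# hypothetical chaining; the job commands in this commit do not invoke it directly
accelerate launch $SHARED_CMD_ARGS || . amlt_configs/post_process.sh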
amlt_configs/setup.sh
ADDED
@@ -0,0 +1,144 @@
#!/bin/bash

# Uninstall mlflow
# pip uninstall -y mlflow-skinny
# pip uninstall -y azureml-mlflow

nvidia-smi

# Download azcopy
TMP_DIR=tmp/
AZCOPY_URL=https://aka.ms/downloadazcopy-v10-linux
AZCOPY_TAR_FILE="$TMP_DIR/azcopy-v10-linux.tar.gz"
AZCOPY_FILE="$TMP_DIR/azcopy"

"$AZCOPY_FILE" --version
has_azcopy=$?

if [[ has_azcopy -eq 0 ]]; then
    echo "azcopy exists"
else
    echo "azcopy does not exist"
    mkdir -p $TMP_DIR
    wget $AZCOPY_URL -O $AZCOPY_TAR_FILE
    file_to_be_extracted="$(tar -tvf $AZCOPY_TAR_FILE | grep -E 'azcopy$' | awk '{print $6}')"
    tar -zxvf $AZCOPY_TAR_FILE -C "$TMP_DIR" "$file_to_be_extracted"
    mv $TMP_DIR/$file_to_be_extracted $TMP_DIR
    rm $AZCOPY_TAR_FILE
    rmdir "$(dirname $TMP_DIR/$file_to_be_extracted)"
    chmod 777 $AZCOPY_FILE
    export PATH=$PATH:$(pwd)/$TMP_DIR
    echo "export PATH=\$PATH:$(pwd)/$TMP_DIR" >> ~/.bashrc
fi

# Install pip requirements
pip install -r requirements.txt
echo "export PATH=\$PATH:\$HOME/.local/bin" >> ~/.bashrc
export PATH=$PATH:$HOME/.local/bin

# Add wandb api key
# ref: https://docs.wandb.ai/guides/track/environment-variables
MY_WANDB_API_KEY='YOUR_WANDB_API_KEY'
export WANDB_API_KEY=$MY_WANDB_API_KEY
echo "export WANDB_API_KEY=$MY_WANDB_API_KEY" >> ~/.bashrc

# Show full error trace from hydra
echo "export HYDRA_FULL_ERROR=1" >> ~/.bashrc

# Patch the datasets library so that Hugging Face downloads of Azure blob URLs go through azcopy
TARGET_DATASETS_VER="2.13.1"
version="$(pip show datasets | grep Version | awk '{print $2}')"
if [[ $version == $TARGET_DATASETS_VER ]]; then
    echo "datasets version is $TARGET_DATASETS_VER, changing it to use azcopy..."
    pip_package_path="$(pip show datasets | grep Location | awk '{print $2}')"
    download_file_path="$pip_package_path/datasets/utils/file_utils.py"
    if [[ -f $download_file_path.bak ]]; then
        cp $download_file_path.bak $download_file_path
    fi
    cp $download_file_path $download_file_path.bak
    sed -i '609 i\
    # NOTE(xiaoke): An intrusion to use azcopy to download from Azure blob storage\
    elif "blob.core.windows.net" in url:\
        process_id = -1\
        try:\
            import torch\
            if torch.distributed.is_initialized():\
                process_id = torch.distributed.get_rank()\
        except ImportError:\
            logger.warning("no torch found, cannot determine whether is in ddp mode")\
        except RuntimeError:\
            logger.warning("torch.distributed is not initialized, cannot determine whether is in ddp mode")\
\
        logger.warning(f"[process {process_id}] Try to use azcopy to download from Azure blob storage")\
        import subprocess\
\
        has_azcopy = subprocess.run(["azcopy"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).returncode\
        if has_azcopy != 0:\
            logger.warning(f"[process {process_id}] azcopy not found, using http_get, which is slow")\
            http_get(\
                url,\
                temp_file,\
                proxies=proxies,\
                resume_size=resume_size,\
                headers=headers,\
                cookies=cookies,\
                max_retries=max_retries,\
                desc=download_desc,\
            )\
        else:\
            logger.warning(f"[process {process_id}] azcopy found, using azcopy")\
            result = subprocess.run(\
                ["azcopy", "cp", url, temp_file.name],\
            )\
            if result.returncode != 0:\
                raise ConnectionError(\
                    f"azcopy failed with return code {result.returncode}"\
                )\
' $download_file_path
else
    echo "datasets version is NOT $TARGET_DATASETS_VER, not changed"
fi

# For debug
sudo apt-get update
if [[ $? -ne 0 ]]; then
    apt-get update
fi
sudo apt-get install -y tmux htop vim lsof
if [[ $? -ne 0 ]]; then
    apt-get install -y tmux htop vim lsof
fi

# Tmux config
curl -L https://raw.githubusercontent.com/hamvocke/dotfiles/master/tmux/.tmux.conf -o - >> ~/.tmux.conf

# Vim config
# Install vim-plug
curl -fLo ~/.vim/autoload/plug.vim --create-dirs \
    https://raw.githubusercontent.com/junegunn/vim-plug/master/plug.vim

cat << EOF > ~/.vimrc
set tabstop=4
set shiftwidth=4
set expandtab
set smartindent
set nu
set hlsearch
set ignorecase
set mouse=a

call plug#begin()
Plug 'tpope/vim-surround'
Plug 'tpope/vim-commentary'
Plug 'davidhalter/jedi-vim'
call plug#end()

let g:jedi#force_py_version = 3 " Force using Python 3
EOF
vim +'PlugInstall --sync' +qa

# Install gpustat
pip install gpustat

# echo pwd
echo "pwd: $(pwd)"
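The sed block above injects an azcopy branch into datasets' file_utils.py at a hard-coded line number, so it only targets datasets==2.13.1. The injected logic boils down to the following standalone sketch; the function name and paths here are illustrative only and are not part of the repo:

# minimal sketch of the injected fallback: prefer azcopy for Azure blob URLs, else plain HTTP
download_blob() {
    local url=$1 dest=$2
    if command -v azcopy >/dev/null 2>&1; then
        azcopy cp "$url" "$dest"
    else
        echo "azcopy not found, falling back to wget (slow)" >&2
        wget -O "$dest" "$url"
    fi
}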
amlt_configs/setup_accelerate_on_azure.sh
ADDED
@@ -0,0 +1,53 @@
#!/bin/bash
source ~/.bashrc

ACCELERATE_CONFIG_PATHS=(amlt_configs/accelerate_config.yaml amlt_configs/accelerate_deepspeed_config.yaml)
if [[ -z "$WORLD_SIZE" ]]; then
    echo "WORLD_SIZE is not set, using 1"
    WORLD_SIZE=1
fi
if [[ -z "$NODE_RANK" ]]; then
    echo "NODE_RANK is not set, using 0"
    NODE_RANK=0
fi
NUM_GPUS_PER_NODE=$(nvidia-smi -L | wc -l)
((NUM_TOTAL_GPUS = WORLD_SIZE * NUM_GPUS_PER_NODE))

echo "Setting up accelerate config:"
echo "ACCELERATE_CONFIG_PATHS: ${ACCELERATE_CONFIG_PATHS[@]}"
echo "NUM_TOTAL_GPUS: $NUM_TOTAL_GPUS"
echo "NUM_GPUS_PER_NODE: $NUM_GPUS_PER_NODE"
echo "WORLD_SIZE: $WORLD_SIZE"
echo "NODE_RANK: $NODE_RANK"
echo "MASTER_ADDR: $MASTER_ADDR"
echo "MASTER_PORT: $MASTER_PORT"

function modify_accelerate_config()
{
    local ACCELERATE_CONFIG_PATH=$1
    if [[ -z "$MASTER_ADDR" ]]; then
        echo "MASTER_ADDR is not set, using localhost"
        sed -i 's/main_process_ip.*//g' $ACCELERATE_CONFIG_PATH
        sed -i 's/main_process_port.*//g' $ACCELERATE_CONFIG_PATH
    else
        sed -i 's/main_process_ip.*/main_process_ip: '"$MASTER_ADDR"'/g' $ACCELERATE_CONFIG_PATH
        sed -i 's/main_process_port.*/main_process_port: '"$MASTER_PORT"'/g' $ACCELERATE_CONFIG_PATH
    fi

    sed -i 's/num_machines.*/num_machines: '"$WORLD_SIZE"'/g' $ACCELERATE_CONFIG_PATH
    sed -i 's/machine_rank.*/machine_rank: '"$NODE_RANK"'/g' $ACCELERATE_CONFIG_PATH

    sed -i 's/num_processes.*/num_processes: '"$NUM_TOTAL_GPUS"'/g' $ACCELERATE_CONFIG_PATH

    accelerate env --config_file $ACCELERATE_CONFIG_PATH
    # accelerate test --config_file $ACCELERATE_CONFIG_PATH # It may cause a bug: ValueError: To use a `DataLoader` in `split_batches` mode, the batch size (8) needs to be a round multiple of the number of processes (16).
}

for ACCELERATE_CONFIG_PATH in "${ACCELERATE_CONFIG_PATHS[@]}"; do
    if [[ -f "$ACCELERATE_CONFIG_PATH" ]]; then
        echo "ACCELERATE_CONFIG_PATH: $ACCELERATE_CONFIG_PATH exists, modifying it with env variables."
        modify_accelerate_config $ACCELERATE_CONFIG_PATH
    else
        echo "ACCELERATE_CONFIG_PATH: $ACCELERATE_CONFIG_PATH does not exist"
    fi
done
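The script only rewrites fields that already exist in the two accelerate config files via sed. A quick way to sanity-check it outside AMLT is to export the variables it reads and source it; the values below are made up for illustration:

# hypothetical two-node, 8-GPUs-per-node setup; rank 0 shown
WORLD_SIZE=2 NODE_RANK=0 MASTER_ADDR=10.0.0.4 MASTER_PORT=29500 \
    . amlt_configs/setup_accelerate_on_azure.sh
# afterwards the configs should contain num_machines: 2, machine_rank: 0, num_processes: 16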
amlt_configs/setup_eval_suite.sh
ADDED
@@ -0,0 +1,28 @@
#!/bin/bash
source ~/.bashrc

ORIGINAL_DIR="$(pwd)"
git clone --recursive https://github.com/xk-huang/vdtk.git /tmp/vdtk -b dev
cd /tmp/vdtk
git submodule update --init --recursive

apt-get update
sudo apt-get update
apt-get install git-lfs gawk
sudo apt-get install git-lfs gawk

git lfs install
git clone https://huggingface.co/xk-huang/vdtk-data
# git submodule init && git submodule update

rsync -avP ./vdtk-data/vdtk .
rm -rf vdtk-data

pip install --upgrade pip
pip install -e . POT==0.9.0 # POT==0.9.1 will take up all the memory with the tf backend
pip install tensorflow==2.12.1 # Just pin one version of tf
pip install levenshtein==0.21.1
pip install openpyxl==3.1.2

python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"
cd "$ORIGINAL_DIR"
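For reference, once this suite is installed the infer configs in this commit call it as `bash scripts/tools/eval_suite.sh <output_dir> infer.json inference [reference_tsv]`, optionally with `SKIP_CLIP_RECALL=1` to skip the CLIP-recall metric. For example (the output path is a placeholder):

# mirrors how the infer-*.yaml jobs invoke the suite; /path/to/output is a placeholder
SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh /path/to/output/vg-densecap-region_descriptions infer.json inference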
amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml
ADDED
@@ -0,0 +1,126 @@
env_defaults:

  # NOTE: a folded string introduced by > (rather than >-) will have a newline appended to the end
  SHARED_CMD_ARGS: >-
    -m src.train
    +model=base_sca_multitask_v2
    training.do_train=True
    training.do_eval=True
    training.do_inference=True
    +data.streaming=False
    training.max_eval_samples=800
    training.max_steps=100000
    training.fp16=True
    training.output_dir=$AMLT_OUTPUT_DIR
    training.output_log_dir=$AMLT_LOGS_DIR
    model.cache_dir=/mnt/blob/weights/.model.cache/
    training.save_strategy=steps
    training.save_steps=5000
    training.save_total_limit=3
    training.optim=adamw_torch
    training.evaluate_before_train=True
    training.per_device_train_batch_size=1
    training.evaluation_strategy=steps
    training.eval_steps=5000
    training.logging_steps=1000
    training.logging_first_step=True
    training.dataloader_num_workers=4
    training.num_masks_per_sample=16
    wandb.project=$AMLT_EXPERIMENT_NAME
    wandb.name=$AMLT_JOB_NAME
    model.num_caption_tokens=8
    model.additional_num_hidden_layers=12
    model.num_task_tokens=6
    training.lr_scheduler_type=cosine
    model.lm_head_model_name_or_path=gpt2-large
    training.learning_rate=1e-5
    training.weight_decay=1e-4
    training.warmup_steps=200
    training.warmup_ratio=0.33333333
    training.compute_metrics=True


environment:

  image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
  # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
  # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
  registry: nvcr.io

code:
  local_dir: $CONFIG_DIR/../


jobs:
- name: gpt2-large
  preemptible: True
  sku: ${NUM_NODES}xG${NUM_GPUS}-V100-IB
  process_count_per_node: 1 # Each node should run 1 process
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - . amlt_configs/setup_accelerate_on_azure.sh
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[vg-densecap-region_descriptions]'
      eval_data='[vg-densecap-region_descriptions]'
      model.lm_head_model_name_or_path=gpt2-large
      $EXTRA_ARGS

  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
      HYDRA_FULL_ERROR: 1
      # NCCL_IB_DISABLE: 1
      # NCCL_IBEXT_DISABLE: 1
    container_args:
      shm_size: 256g

- name: open_llama_3b_v2
  preemptible: True
  sku: ${NUM_NODES}xG${NUM_GPUS}-V100-IB
  process_count_per_node: 1 # Each node should run 1 process
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - . amlt_configs/setup_accelerate_on_azure.sh
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[vg-densecap-region_descriptions]'
      eval_data='[vg-densecap-region_descriptions]'
      model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
      training.gradient_checkpointing=true
      $EXTRA_ARGS

  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
      HYDRA_FULL_ERROR: 1
      # NCCL_IB_DISABLE: 1
      # NCCL_IBEXT_DISABLE: 1
    container_args:
      shm_size: 256g


# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=1e-4" \
# -t msroctovc -w msroctows --sku=G8-V100 --no-pre \
# amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_4 train-sca-ablat-finetune-scale_lr-110423

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=1e-5" \
# -t msroctovc -w msroctows --sku=G8-V100 --no-pre \
# amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_5 train-sca-ablat-finetune-scale_lr-110423

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-5" \
# -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
# amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_5-1xlr train-sca-ablat-finetune-scale_lr-110423

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4" \
# -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
# amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_4-1xlr train-sca-ablat-finetune-scale_lr-110423
amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml
ADDED
@@ -0,0 +1,128 @@
env_defaults:

  # NOTE: a folded string introduced by > (rather than >-) will have a newline appended to the end
  SHARED_CMD_ARGS: >-
    -m src.train
    +model=base_sca_multitask_v2
    training.do_train=True
    training.do_eval=True
    training.do_inference=True
    +data.streaming=False
    training.max_eval_samples=800
    training.max_steps=200000
    training.fp16=True
    training.output_dir=$AMLT_OUTPUT_DIR
    training.output_log_dir=$AMLT_LOGS_DIR
    model.cache_dir=/mnt/blob/weights/.model.cache/
    training.save_strategy=steps
    training.save_steps=5000
    training.save_total_limit=3
    training.optim=adamw_torch
    training.evaluate_before_train=True
    training.per_device_train_batch_size=1
    training.evaluation_strategy=steps
    training.eval_steps=5000
    training.logging_steps=1000
    training.logging_first_step=True
    training.dataloader_num_workers=4
    training.num_masks_per_sample=16
    wandb.project=$AMLT_EXPERIMENT_NAME
    wandb.name=$AMLT_JOB_NAME
    model.num_caption_tokens=8
    model.additional_num_hidden_layers=12
    model.num_task_tokens=6
    training.lr_scheduler_type=cosine
    model.lm_head_model_name_or_path=gpt2-large
    training.learning_rate=1e-4
    training.weight_decay=1e-4
    training.warmup_steps=200
    training.warmup_ratio=0.33333333
    training.compute_metrics=True


environment:

  image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
  # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
  # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
  registry: nvcr.io

code:
  local_dir: $CONFIG_DIR/../


jobs:
- name: gpt2-large
  preemptible: True
  sku: ${NUM_NODES}xG${NUM_GPUS}
  process_count_per_node: 1 # Each node should run 1 process
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - . amlt_configs/setup_accelerate_on_azure.sh
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[vg-densecap-local]'
      eval_data='[vg-densecap-local]'
      model.lm_head_model_name_or_path=gpt2-large
      $EXTRA_ARGS

  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
      HYDRA_FULL_ERROR: 1
    container_args:
      shm_size: 256g

- name: open_llama_3b_v2
  preemptible: True
  sku: ${NUM_NODES}xG${NUM_GPUS}
  process_count_per_node: 1 # Each node should run 1 process
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - . amlt_configs/setup_accelerate_on_azure.sh
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[vg-densecap-local]'
      eval_data='[vg-densecap-local]'
      model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
      training.gradient_checkpointing=true
      $EXTRA_ARGS

  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
      HYDRA_FULL_ERROR: 1
    container_args:
      shm_size: 256g


# sing resrch 1x8 no-pre lsj
# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0" -t msrresrchvc -w msrresrchws --sku=G8-V100 --no-pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.resrch-1x8-v100-16g-no_pre.ollm3bv2-large-lsj train-sca-ablat-lsj-scale_lr-110423
# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0" -t msrresrchvc -w msrresrchws --sku=G8-V100 --no-pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-1x8-v100-16g-no_pre.gpt2-large-lsj train-sca-ablat-lsj-scale_lr-110423

# sing octo 4x8 no-pre lsj
# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr train-sca-ablat-lsj-scale_lr-110423
# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr train-sca-ablat-lsj-scale_lr-110423

# The maximum scale lr with BS 64: 8e-4 (too big to achieve better)
# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=8e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.resrch-16x4-v100-16g-pre.ollm3bv2-large-lsj-1xlr train-sca-ablat-lsj-scale_lr-110423
# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=8e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-16x4-v100-16g-no_pre.gpt2-large-lsj-1xlr train-sca-ablat-lsj-scale_lr-110423

# The maximum scale lr with BS 64: 4e-4 (try to achieve better with that from BS 32)
# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.resrch-16x4-v100-16g-pre.ollm3bv2-large-lsj-1xlr-4e_4 train-sca-ablat-lsj-scale_lr-110423
# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-16x4-v100-16g-no_pre.gpt2-large-lsj-1xlr-4e_4 train-sca-ablat-lsj-scale_lr-110423

# 1x8, 4e-4
# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t itplabrr1cl1 -w resrchvc --sku=G8-V100 --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.rr1-1x8-v100-16g-pre.ollm3bv2-large-lsj-4e_4 train-sca-ablat-lsj-scale_lr-110423
# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t itplabrr1cl1 -w resrchvc --sku=G8-V100 --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.rr1-1x8-v100-16g-pre.gpt2-large-lsj-4e_4 train-sca-ablat-lsj-scale_lr-110423

# The maximum scale lr with BS 64: 4e-4 (try to achieve better with that from BS 32)
# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :1=`date +"%m%d%y"`.resrch-16x4-v100-16g-pre.ollm3bv2-large-lsj-1xlr-4e_4 train-sca-ablat-lsj-scale_lr-110423
# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msrresrchvc -w msrresrchws --sku=16xG4-V100-IB --pre amlt_configs/train-sca-ablat-lsj-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-16x4-v100-16g-no_pre.gpt2-large-lsj-1xlr-4e_4 train-sca-ablat-lsj-scale_lr-110423
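The learning rates in the comments above are consistent with the linear scaling rule: the base 1x8 (8-GPU) runs use 1e-4, so a 4x8 (32-GPU) run scales to 4e-4 and a 16x4 (64-GPU) run to 8e-4. A quick recomputation, where the base values come from this file and the helper itself is only illustrative:

# linear LR scaling sketch: scaled_lr = base_lr * total_gpus / base_gpus
BASE_LR=1e-4 BASE_GPUS=8 TOTAL_GPUS=32
awk -v lr=$BASE_LR -v b=$BASE_GPUS -v t=$TOTAL_GPUS 'BEGIN { printf "%.0e\n", lr * t / b }'
# prints 4e-04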
amlt_configs/train-sca-ablat-model_arch-103123.yaml
ADDED
@@ -0,0 +1,112 @@
env_defaults:

  # NOTE: a folded string introduced by > (rather than >-) will have a newline appended to the end
  SHARED_CMD_ARGS: >-
    -m src.train
    training.do_train=True
    training.do_eval=True
    training.do_inference=True
    training.max_eval_samples=800
    training.max_steps=200000
    training.fp16=True
    training.output_dir=$AMLT_OUTPUT_DIR
    training.output_log_dir=$AMLT_LOGS_DIR
    training.save_strategy=steps
    training.save_steps=5000
    training.save_total_limit=3
    training.optim=adamw_torch
    training.evaluate_before_train=True
    training.per_device_train_batch_size=1
    training.evaluation_strategy=steps
    training.eval_steps=5000
    training.logging_steps=1000
    training.logging_first_step=True
    training.dataloader_num_workers=4
    training.num_masks_per_sample=16
    training.lr_scheduler_type=cosine
    training.learning_rate=1e-4
    training.weight_decay=1e-4
    training.warmup_steps=200
    training.warmup_ratio=0.33333333
    training.compute_metrics=True
    wandb.project=$AMLT_EXPERIMENT_NAME
    wandb.name=$AMLT_JOB_NAME
    model.cache_dir=/mnt/blob/weights/.model.cache/
    model.num_task_tokens=6
    model.lm_head_model_name_or_path=gpt2-large


environment:

  image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
  # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
  # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
  registry: nvcr.io

code:
  local_dir: $CONFIG_DIR/../


jobs:
- name: gpt2-large
  preemptible: True
  sku: ${NUM_NODES}xG${NUM_GPUS}
  process_count_per_node: 1 # Each node should run 1 process
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - . amlt_configs/setup_accelerate_on_azure.sh
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[vg-densecap-local]'
      eval_data='[vg-densecap-local]'
      model.lm_head_model_name_or_path=gpt2-large
      $EXTRA_ARGS

  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
      HYDRA_FULL_ERROR: 1
    container_args:
      shm_size: 256g

- name: open_llama_3b_v2
  preemptible: True
  sku: ${NUM_NODES}xG${NUM_GPUS}
  process_count_per_node: 1 # Each node should run 1 process
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - . amlt_configs/setup_accelerate_on_azure.sh
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[vg-densecap-local]'
      eval_data='[vg-densecap-local]'
      model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
      training.gradient_checkpointing=true
      $EXTRA_ARGS

  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
      HYDRA_FULL_ERROR: 1
    container_args:
      shm_size: 256g


# sing resrch 1x8
# amlt run amlt_configs/train-sca-ablat-model_arch-103123.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.gpt2-large-mtv2 train-sca-ablat-model_arch-103123 -d "" --extra-args "+model=base_sca_multitask_v2 model.num_caption_tokens=8 model.additional_num_hidden_layers=12" -t msroctovc -w msroctows --sku=G8-V100 --no-pre
# amlt run amlt_configs/train-sca-ablat-model_arch-103123.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.gpt2-large-sm train-sca-ablat-model_arch-103123 -d "" --extra-args "+model=base_sca_multitask_split_mixer model.num_caption_tokens=8 model.additional_num_hidden_layers=12" -t msroctovc -w msroctows --sku=G8-V100 --no-pre
# sing octo 1x8 v100 16g
# amlt run amlt_configs/train-sca-ablat-model_arch-103123.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.gpt2-large-ddv2 train-sca-ablat-model_arch-103123 -d "" --extra-args "+model=base_sca_direct_decoding_v2 model.additional_num_hidden_layers=12" -t msroctovc -w msroctows --sku=G8-V100 --no-pre
# amlt run -d "" --extra-args "+model=base_sca_multitask_roi_pool" -t msrresrchvc -w msrresrchws --sku=G8-V100 --no-pre amlt_configs/train-sca-ablat-model_arch-103123.yaml :0=`date +"%m%d%y"`.resrch-1x8-v100-16g-no_pre.gpt2-large-roi_pool train-sca-ablat-model_arch-103123
# amlt run -d "" --extra-args "+model=base_sca_multitask_roi_pool model.vl_projector_type=mlp" -t msrresrchvc -w msrresrchws --sku=G8-V100 --no-pre amlt_configs/train-sca-ablat-model_arch-103123.yaml :0=`date +"%m%d%y"`.resrch-1x8-v100-16g-no_pre.gpt2-large-roi_pool-mlp train-sca-ablat-model_arch-103123

# Caveat:
# 1. cannot add two "+model"
# 2. base_sca_direct_decoding_v2 cannot add `num_caption_tokens`
amlt_configs/train-sca-ablat-sam_size-110423.yaml
ADDED
@@ -0,0 +1,108 @@
env_defaults:

  # NOTE: a folded string introduced by > (rather than >-) will have a newline appended to the end
  SHARED_CMD_ARGS: >-
    -m src.train
    +model=base_sca_multitask_v2
    training.do_train=True
    training.do_eval=True
    training.do_inference=True
    training.max_eval_samples=800
    training.max_steps=200000
    training.fp16=True
    training.output_dir=$AMLT_OUTPUT_DIR
    training.output_log_dir=$AMLT_LOGS_DIR
    training.save_strategy=steps
    training.save_steps=5000
    training.save_total_limit=3
    training.optim=adamw_torch
    training.evaluate_before_train=True
    training.per_device_train_batch_size=1
    training.evaluation_strategy=steps
    training.eval_steps=5000
    training.logging_steps=1000
    training.logging_first_step=True
    training.dataloader_num_workers=4
    training.num_masks_per_sample=16
    training.lr_scheduler_type=cosine
    training.learning_rate=1e-4
    training.weight_decay=1e-4
    training.warmup_steps=200
    training.warmup_ratio=0.33333333
    training.compute_metrics=True
    wandb.project=$AMLT_EXPERIMENT_NAME
    wandb.name=$AMLT_JOB_NAME
    model.cache_dir=/mnt/blob/weights/.model.cache/
    model.additional_num_hidden_layers=12
    model.num_task_tokens=6
    model.lm_head_model_name_or_path=gpt2-large
    model.num_caption_tokens=8


environment:

  image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
  # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
  # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
  registry: nvcr.io

code:
  local_dir: $CONFIG_DIR/../


jobs:
- name: gpt2-large
  preemptible: True
  sku: ${NUM_NODES}xG${NUM_GPUS}
  process_count_per_node: 1 # Each node should run 1 process
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - . amlt_configs/setup_accelerate_on_azure.sh
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[vg-densecap-local]'
      eval_data='[vg-densecap-local]'
      model.lm_head_model_name_or_path=gpt2-large
      $EXTRA_ARGS

  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
      HYDRA_FULL_ERROR: 1
    container_args:
      shm_size: 256g

- name: open_llama_3b_v2
  preemptible: True
  sku: ${NUM_NODES}xG${NUM_GPUS}
  process_count_per_node: 1 # Each node should run 1 process
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - . amlt_configs/setup_accelerate_on_azure.sh
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[vg-densecap-local]'
      eval_data='[vg-densecap-local]'
      model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
      training.gradient_checkpointing=true
      $EXTRA_ARGS

  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
      HYDRA_FULL_ERROR: 1
    container_args:
      shm_size: 256g


# sing resrch 1x8
# amlt run amlt_configs/train-sca-ablat-sam_size-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.sam-vit-huge train-sca-ablat-sam_size-110423 -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-huge" -t msroctovc -w msroctows --sku=G8-V100 --no-pre -y
# amlt run amlt_configs/train-sca-ablat-sam_size-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.sam-vit-large train-sca-ablat-sam_size-110423 -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-large" -t msroctovc -w msroctows --sku=G8-V100 --no-pre -y
# amlt run amlt_configs/train-sca-ablat-sam_size-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.sam-vit-base train-sca-ablat-sam_size-110423 -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-base" -t msroctovc -w msroctows --sku=G8-V100 --no-pre -y
amlt_configs/train-sca-ablat-timm.yaml
ADDED
@@ -0,0 +1,143 @@
env_defaults:

  # NOTE: a folded string introduced by > (rather than >-) will have a newline appended to the end
  # Use base_sca_multitask_v2
  # training.lr_scheduler_type=constant
  SHARED_CMD_ARGS: >-
    -m src.train
    +model=base_sca_timm_multitask_v2
    training.do_train=True
    training.do_eval=True
    training.do_inference=True
    +data.streaming=False
    training.max_eval_samples=800
    training.max_steps=200000
    training.fp16=True
    training.output_dir=$AMLT_OUTPUT_DIR
    training.output_log_dir=$AMLT_LOGS_DIR
    model.cache_dir=/mnt/blob/weights/.model.cache/
    training.save_strategy=steps
    training.save_steps=5000
    training.save_total_limit=3
    training.optim=adamw_torch
    training.evaluate_before_train=True
    training.per_device_train_batch_size=1
    training.evaluation_strategy=steps
    training.eval_steps=5000
    training.logging_steps=1000
    training.logging_first_step=True
    training.dataloader_num_workers=4
    training.num_masks_per_sample=16
    wandb.project=$AMLT_EXPERIMENT_NAME
    wandb.name=$AMLT_JOB_NAME
    model.num_caption_tokens=8
    model.additional_num_hidden_layers=12
    model.num_task_tokens=6
    training.lr_scheduler_type=cosine
    model.lm_head_model_name_or_path=gpt2-large
    training.learning_rate=1e-4
    training.weight_decay=1e-4
    training.warmup_steps=200
    training.warmup_ratio=0.33333333
    training.compute_metrics=True


environment:

  image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
  # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
  # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
  registry: nvcr.io

code:
  local_dir: $CONFIG_DIR/../


jobs:
- name: vit_base_patch32_clip_224.openai
  preemptible: True
  sku: ${NUM_NODES}xG${NUM_GPUS}
  process_count_per_node: 1 # Each node should run 1 process
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - . amlt_configs/setup_accelerate_on_azure.sh
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[vg-densecap-local]'
      eval_data='[vg-densecap-local]'
      model.sam_model_name_or_path=facebook/sam-vit-base
      model.timm_vision_name=vit_base_patch32_clip_224.openai
      $EXTRA_ARGS
  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
      HYDRA_FULL_ERROR: 1
      # NCCL_IB_DISABLE: 1
      # NCCL_IBEXT_DISABLE: 1
    container_args:
      shm_size: 256g

# TIMM_NAME_LS=(
#   vit_large_patch14_clip_336.openai
#   vit_large_patch14_clip_224.datacompxl
#   eva02_large_patch14_clip_336.merged2b
# )
# for TIMM_NAME in ${TIMM_NAME_LS[@]}; do
#   amlt run -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2) model.timm_vision_name=$TIMM_NAME" \
#     -t itplabrr1cl1 -w resrchvc --no-pre \
#     amlt_configs/train-sca-ablat-timm.yaml \
#     :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.$TIMM_NAME \
#     112323.train-sca-ablat-timm
# done

# TIMM_NAME_LS=(
#   vit_large_patch14_clip_336.openai
#   vit_large_patch14_clip_224.datacompxl
#   eva02_large_patch14_clip_336.merged2b
# )
# for TIMM_NAME in ${TIMM_NAME_LS[@]}; do
#   amlt run -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2) model.timm_vision_name=$TIMM_NAME training.trainable_params='[mask_decoder.additional_transformer,mask_decoder.caption_tokens,task_tokens,language_project,neck,mask_decoder.transformer]'" \
#     -t itplabrr1cl1 -w resrchvc --no-pre \
#     amlt_configs/train-sca-ablat-timm.yaml \
#     :0=`date +"%m%d%y"`.rr1-1x8-v100-16g-no_pre.$TIMM_NAME-tune_sam_xformer \
#     112323.train-sca-ablat-timm
# done

# TIMM_NAME_LS=(
#   vit_large_patch16_224.mae
#   vit_large_patch14_reg4_dinov2.lvd142m
# )
# for TIMM_NAME in ${TIMM_NAME_LS[@]}; do
#   amlt run -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2) model.timm_vision_name=$TIMM_NAME training.trainable_params='[mask_decoder.additional_transformer,mask_decoder.caption_tokens,task_tokens,language_project,neck,mask_decoder.transformer]'" \
#     -t itplabrr1cl1 -w resrchvc --no-pre \
#     amlt_configs/train-sca-ablat-timm.yaml \
#     :0=`date +"%m%d%y"`.rr1-1x8-v100-16g-no_pre.$TIMM_NAME-tune_sam_xformer \
#     112323.train-sca-ablat-timm
# done

# TIMM_NAME_LS=(
#   vit_large_patch14_reg4_dinov2.lvd142m
# )
# for TIMM_NAME in ${TIMM_NAME_LS[@]}; do
#   amlt run -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2) model.timm_vision_name=$TIMM_NAME training.trainable_params='[mask_decoder.additional_transformer,mask_decoder.caption_tokens,task_tokens,language_project,neck,mask_decoder.transformer]'" \
#     -t msroctovc -w msroctows --no-pre \
#     amlt_configs/train-sca-ablat-timm.yaml \
#     :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.$TIMM_NAME-tune_sam_xformer \
#     112323.train-sca-ablat-timm
# done

# TIMM_NAME_LS=(
#   eva02_large_patch14_clip_336.merged2b
# )
# for TIMM_NAME in ${TIMM_NAME_LS[@]}; do
#   amlt run -d "" --extra-args "model.sam_model_name_or_path=facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2) model.timm_vision_name=$TIMM_NAME" \
#     -t msroctovc -w msroctows --no-pre \
#     amlt_configs/train-sca-ablat-timm.yaml \
#     :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.$TIMM_NAME \
#     112323.train-sca-ablat-timm
# done
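The commented-out loops above derive the paired SAM backbone size from the second underscore-separated field of the timm model name. For example, using a name taken from the lists above:

# vit_large_patch14_clip_336.openai -> "large" -> facebook/sam-vit-large
TIMM_NAME=vit_large_patch14_clip_336.openai
echo "facebook/sam-vit-$(echo $TIMM_NAME | cut -d '_' -f2)"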
amlt_configs/train-sca-ablat-weak_sup_data.yaml
ADDED
@@ -0,0 +1,327 @@
env_defaults:

  # NOTE: a folded string introduced by > (rather than >-) will have a newline appended to the end
  # Use base_sca_multitask_v2
  # training.lr_scheduler_type=constant
  SHARED_CMD_ARGS: >-
    -m src.train
    +model=base_sca_multitask_v2
    training.do_train=True
    training.do_eval=True
    training.do_inference=True
    +data.streaming=False
    training.max_eval_samples=800
    training.max_steps=200000
    training.fp16=True
    training.output_dir=$AMLT_OUTPUT_DIR
    training.output_log_dir=$AMLT_LOGS_DIR
    model.cache_dir=/mnt/blob/weights/.model.cache/
    training.save_strategy=steps
    training.save_steps=5000
    training.save_total_limit=3
    training.optim=adamw_torch
    training.evaluate_before_train=True
    training.per_device_train_batch_size=1
    training.evaluation_strategy=steps
    training.eval_steps=5000
    training.logging_steps=1000
    training.logging_first_step=True
    training.dataloader_num_workers=4
    training.num_masks_per_sample=16
    wandb.project=$AMLT_EXPERIMENT_NAME
    wandb.name=$AMLT_JOB_NAME
    model.num_caption_tokens=8
    model.additional_num_hidden_layers=12
    model.num_task_tokens=6
    training.lr_scheduler_type=cosine
    model.lm_head_model_name_or_path=gpt2-large
    training.learning_rate=1e-4
    training.weight_decay=1e-4
    training.warmup_steps=200
    training.warmup_ratio=0.33333333
    training.compute_metrics=True


environment:

  image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
  # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot be initialized successfully
  # image: nvidia/pytorch:22.12-py3 # Pydantic has a bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; does not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
  registry: nvcr.io

code:
  local_dir: $CONFIG_DIR/../


jobs:
- name: only-vg
  preemptible: True
  sku: ${NUM_NODES}xG${NUM_GPUS}
  process_count_per_node: 1 # Each node should run 1 process
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - . amlt_configs/setup_accelerate_on_azure.sh
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[vg-densecap-local]'
      eval_data='[vg-densecap-local]'
      training.max_steps=100000
      training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
      wandb.name=$$AMLT_JOB_NAME-vg
      $EXTRA_ARGS
  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
      HYDRA_FULL_ERROR: 1
      # NCCL_IB_DISABLE: 1
      # NCCL_IBEXT_DISABLE: 1
    container_args:
      shm_size: 256g

- name: first-coco-then-vg
  preemptible: True
  sku: ${NUM_NODES}xG${NUM_GPUS}
  process_count_per_node: 1 # Each node should run 1 process
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - . amlt_configs/setup_accelerate_on_azure.sh
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[coco-instance-task_type_caption-local]'
      eval_data='[coco-instance-task_type_caption-local]'
      training.max_steps=100000
      training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
      wandb.name=$$AMLT_JOB_NAME-coco
      $EXTRA_ARGS
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[vg-densecap-local]'
      eval_data='[vg-densecap-local]'
      training.max_steps=100000
      training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
      wandb.name=$$AMLT_JOB_NAME-vg
      model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
      $EXTRA_ARGS
  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
      HYDRA_FULL_ERROR: 1
      # NCCL_IB_DISABLE: 1
      # NCCL_IBEXT_DISABLE: 1
    container_args:
      shm_size: 256g

- name: first-v3det-task_type_caption-local-then-vg
  preemptible: True
  sku: ${NUM_NODES}xG${NUM_GPUS}
  process_count_per_node: 1 # Each node should run 1 process
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - . amlt_configs/setup_accelerate_on_azure.sh
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      $EXTRA_ARGS
      train_data='[v3det-task_type_caption-local]'
      eval_data='[coco-instance-task_type_caption-local]'
      training.max_steps=100000
      training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
      wandb.name=$$AMLT_JOB_NAME-v3det
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[vg-densecap-local]'
      eval_data='[vg-densecap-local]'
      training.max_steps=100000
      training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
      wandb.name=$$AMLT_JOB_NAME-vg
      model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
      $EXTRA_ARGS
  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
      HYDRA_FULL_ERROR: 1
      # NCCL_IB_DISABLE: 1
      # NCCL_IBEXT_DISABLE: 1
    container_args:
      shm_size: 256g

- name: first-objects365-then-vg
  preemptible: True
  sku: ${NUM_NODES}xG${NUM_GPUS}
  process_count_per_node: 1 # Each node should run 1 process
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - . amlt_configs/setup_accelerate_on_azure.sh
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[objects365-task_type_caption-local]'
      eval_data='[coco-instance-task_type_caption-local]'
      training.max_steps=100000
      training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
      wandb.name=$$AMLT_JOB_NAME-objects365
      $EXTRA_ARGS
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[vg-densecap-local]'
      eval_data='[vg-densecap-local]'
      training.max_steps=100000
      training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
      wandb.name=$$AMLT_JOB_NAME-vg
      model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
      $EXTRA_ARGS
  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
      HYDRA_FULL_ERROR: 1
      # NCCL_IB_DISABLE: 1
      # NCCL_IBEXT_DISABLE: 1
    container_args:
      shm_size: 256g


- name: first-coco-v3det-task_type_caption-local-then-vg
  preemptible: True
  sku: ${NUM_NODES}xG${NUM_GPUS}
  process_count_per_node: 1 # Each node should run 1 process
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - . amlt_configs/setup_accelerate_on_azure.sh
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      $EXTRA_ARGS
      train_data='[coco-instance-task_type_caption-local,v3det-task_type_caption-local]'
      train_data_interleave_probabilities='[117266,183348]'
      eval_data='[coco-instance-task_type_caption-local]'
      training.max_steps=100000
      training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
      wandb.name=$$AMLT_JOB_NAME-v3det
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[vg-densecap-local]'
      eval_data='[vg-densecap-local]'
      training.max_steps=100000
      training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
      wandb.name=$$AMLT_JOB_NAME-vg
      model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
      $EXTRA_ARGS
  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
      HYDRA_FULL_ERROR: 1
      # NCCL_IB_DISABLE: 1
      # NCCL_IBEXT_DISABLE: 1
    container_args:
      shm_size: 256g

- name: first-coco-v3det-objects365-then-vg
  preemptible: True
  sku: ${NUM_NODES}xG${NUM_GPUS}
  process_count_per_node: 1 # Each node should run 1 process
  command:
    - . amlt_configs/setup.sh
    - source ~/.bashrc
    - . amlt_configs/setup_accelerate_on_azure.sh
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      $EXTRA_ARGS
      train_data='[coco-instance-task_type_caption-local,v3det-task_type_caption-local,objects365-task_type_caption-local]'
      train_data_interleave_probabilities='[117266,183348,1742289]'
      eval_data='[coco-instance-task_type_caption-local]'
      training.max_steps=100000
      training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
      wandb.name=$$AMLT_JOB_NAME-v3det
    - >-
      accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
      $SHARED_CMD_ARGS
      train_data='[vg-densecap-local]'
      eval_data='[vg-densecap-local]'
      training.max_steps=100000
      training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
      wandb.name=$$AMLT_JOB_NAME-vg
      model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
      $EXTRA_ARGS
  submit_args:
    env:
      SHARED_MEMORY_PERCENT: 0.5
      HYDRA_FULL_ERROR: 1
|
264 |
+
# NCCL_IB_DISABLE: 1
|
265 |
+
# NCCL_IBEXT_DISABLE: 1
|
266 |
+
container_args:
|
267 |
+
shm_size: 256g
|
268 |
+
|
269 |
+
- name: first-coco-objects365-then-vg
|
270 |
+
preemptible: True
|
271 |
+
sku: ${NUM_NODES}xG${NUM_GPUS}
|
272 |
+
process_count_per_node: 1 # Each node should run 1 process
|
273 |
+
command:
|
274 |
+
- . amlt_configs/setup.sh
|
275 |
+
- source ~/.bashrc
|
276 |
+
- . amlt_configs/setup_accelerate_on_azure.sh
|
277 |
+
- >-
|
278 |
+
accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
|
279 |
+
$SHARED_CMD_ARGS
|
280 |
+
$EXTRA_ARGS
|
281 |
+
train_data='[coco-instance-task_type_caption-local,objects365-task_type_caption-local]'
|
282 |
+
train_data_interleave_probabilities='[117266,1742289]'
|
283 |
+
eval_data='[coco-instance-task_type_caption-local]'
|
284 |
+
training.max_steps=100000
|
285 |
+
training.output_dir=$$AMLT_OUTPUT_DIR/stage_1
|
286 |
+
wandb.name=$$AMLT_JOB_NAME-v3det
|
287 |
+
- >-
|
288 |
+
accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
|
289 |
+
$SHARED_CMD_ARGS
|
290 |
+
train_data='[vg-densecap-local]'
|
291 |
+
eval_data='[vg-densecap-local]'
|
292 |
+
training.max_steps=100000
|
293 |
+
training.output_dir=$$AMLT_OUTPUT_DIR/stage_2
|
294 |
+
wandb.name=$$AMLT_JOB_NAME-vg
|
295 |
+
model.model_name_or_path=$$(find $$AMLT_OUTPUT_DIR/stage_1 -name 'checkpoint*' | sort | tail -n1)
|
296 |
+
$EXTRA_ARGS
|
297 |
+
submit_args:
|
298 |
+
env:
|
299 |
+
SHARED_MEMORY_PERCENT: 0.5
|
300 |
+
HYDRA_FULL_ERROR: 1
|
301 |
+
# NCCL_IB_DISABLE: 1
|
302 |
+
# NCCL_IBEXT_DISABLE: 1
|
303 |
+
container_args:
|
304 |
+
shm_size: 256g
|
305 |
+
|
306 |
+
|
307 |
+
# sing clusters, both octo and resrch failed
|
308 |
+
# amlt run -d "" \
|
309 |
+
# -t msroctovc -w msroctows --no-pre \
|
310 |
+
# amlt_configs/train-sca-ablat-weak_sup_data.yaml \
|
311 |
+
# 112123.train-sca-ablat-weak_sup_data.octo
|
312 |
+
|
313 |
+
# sing clusters, both octo and resrch failed
|
314 |
+
# amlt run -d "" \
|
315 |
+
# -t msrresrchvc -w msrresrchws --no-pre \
|
316 |
+
# amlt_configs/train-sca-ablat-weak_sup_data.yaml \
|
317 |
+
# 112123.train-sca-ablat-weak_sup_data.resrch
|
318 |
+
|
319 |
+
# amlt run -d "" \
|
320 |
+
# -t itplabrr1cl1 -w resrchvc --no-pre \
|
321 |
+
# amlt_configs/train-sca-ablat-weak_sup_data.yaml \
|
322 |
+
# 112123.train-sca-ablat-weak_sup_data.rr1
|
323 |
+
|
324 |
+
# amlt run -d "" \
|
325 |
+
# -t msroctovc -w msroctows --no-pre \
|
326 |
+
# amlt_configs/train-sca-ablat-weak_sup_data.yaml :first-coco-objects365-then-vg \
|
327 |
+
# 112123.train-sca-ablat-weak_sup_data.rr1
|
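Each two-stage job above hands the newest stage_1 checkpoint to stage_2 through model.model_name_or_path=$$(find ... | sort | tail -n1) (the doubled $$ is the AMLT escape for a literal $ in the command). A minimal local sketch of that hand-off is below, assuming AMLT_OUTPUT_DIR is set in the environment; note that plain sort orders checkpoint names lexicographically, so checkpoint-95000 sorts after checkpoint-100000 when step counts differ in digit length, and sort -V (version sort) avoids that pitfall:

    # Sketch: pick the newest stage_1 checkpoint and feed it to the stage_2 launch.
    STAGE1_DIR="$AMLT_OUTPUT_DIR/stage_1"
    LAST_CKPT=$(find "$STAGE1_DIR" -name 'checkpoint*' | sort -V | tail -n1)
    echo "stage_2 will resume from: $LAST_CKPT"
    # accelerate launch ... model.model_name_or_path="$LAST_CKPT" ...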
amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml
ADDED
@@ -0,0 +1,178 @@
env_defaults:


  # NOTE: this kind of string leaded by > will append a new line to the end of the string
  SHARED_CMD_ARGS: >-
    -m src.train
    +model=base_sca_multitask_v2
    training.do_train=True
    training.do_eval=True
    training.do_inference=True
    +data.streaming=False
    training.max_eval_samples=800
    training.max_steps=200000
    training.fp16=True
    training.output_dir=$AMLT_OUTPUT_DIR
    training.output_log_dir=$AMLT_LOGS_DIR
    model.cache_dir=/mnt/blob/weights/.model.cache/
    training.save_strategy=steps
    training.save_steps=5000
    training.save_total_limit=3
    training.optim=adamw_torch
    training.evaluate_before_train=True
    training.per_device_train_batch_size=1
    training.evaluation_strategy=steps
    training.eval_steps=5000
    training.logging_steps=1000
    training.logging_first_step=True
    training.dataloader_num_workers=4
    training.num_masks_per_sample=16
    wandb.project=$AMLT_EXPERIMENT_NAME
    wandb.name=$AMLT_JOB_NAME
    model.num_caption_tokens=8
    model.additional_num_hidden_layers=12
    model.num_task_tokens=6
    training.lr_scheduler_type=cosine
    model.lm_head_model_name_or_path=gpt2-large
    training.learning_rate=1e-4
    training.weight_decay=1e-4
    training.warmup_steps=200
    training.warmup_ratio=0.33333333
    training.compute_metrics=True



environment:

  image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot initialized successfully
  # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot initialized successfully
  # image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
  registry: nvcr.io

code:
  local_dir: $CONFIG_DIR/../



jobs:
  - name: gpt2-large
    preemptible: True
    sku: ${NUM_NODES}xG${NUM_GPUS}
    process_count_per_node: 1 # Each node should run 1 process
    command:
      - . amlt_configs/setup.sh
      - source ~/.bashrc
      - . amlt_configs/setup_accelerate_on_azure.sh
      - >-
        accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
        $SHARED_CMD_ARGS
        train_data='[objects365-task_type_caption-local]'
        eval_data='[objects365-task_type_caption-local]'
        model.lm_head_model_name_or_path=gpt2-large
        $EXTRA_ARGS

    submit_args:
      env:
        SHARED_MEMORY_PERCENT: 0.5
        HYDRA_FULL_ERROR: 1
        # NCCL_IB_DISABLE: 1
        # NCCL_IBEXT_DISABLE: 1
      container_args:
        shm_size: 256g

  - name: open_llama_3b_v2
    preemptible: True
    sku: ${NUM_NODES}xG${NUM_GPUS}
    process_count_per_node: 1 # Each node should run 1 process
    command:
      - . amlt_configs/setup.sh
      - source ~/.bashrc
      - . amlt_configs/setup_accelerate_on_azure.sh
      - >-
        accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
        $SHARED_CMD_ARGS
        train_data='[objects365-task_type_caption-local]'
        eval_data='[objects365-task_type_caption-local]'
        model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
        training.gradient_checkpointing=true
        $EXTRA_ARGS

    submit_args:
      env:
        SHARED_MEMORY_PERCENT: 0.5
        HYDRA_FULL_ERROR: 1
        # NCCL_IB_DISABLE: 1
        # NCCL_IBEXT_DISABLE: 1
      container_args:
        shm_size: 256g




# sing octo 4x8 no-pre lsj
# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml :1=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr train-sca-pretrain-o365-lsj-scale_lr-110923
# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml :0=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr train-sca-pretrain-o365-lsj-scale_lr-110923

# amlt run -d "" --extra-args "training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml :1=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.ollm3bv2-large-no_lsj-1xlr train-sca-pretrain-o365-lsj-scale_lr-110923
# amlt run -d "" --extra-args "training.learning_rate=4e-4" -t msroctovc -w msroctows --sku=4xG8-V100 --no-pre amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml :0=`date +"%m%d%y"`.octo-4x8-v100-16g-no_pre.gpt2-large-no_lsj-1xlr train-sca-pretrain-o365-lsj-scale_lr-110923


# 4x8(x2)
# amlt run -d "" --extra-args "training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
# -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
# amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
# :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.ollm3bv2-large-no_lsj-1xlr-bs_2 \
# train-sca-pretrain-o365-lsj-scale_lr-110923

# amlt run -d "" --extra-args "training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
# -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
# amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
# :0=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.gpt2-large-no_lsj-1xlr-bs_2 \
# train-sca-pretrain-o365-lsj-scale_lr-110923


# 4x8(x2), 1e-4
# amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2" \
# -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
# amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
# :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.ollm3bv2-large-no_lsj-0xlr-bs_64 \
# train-sca-pretrain-o365-lsj-scale_lr-110923

# resume above due to node 4006 failed
# amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2 training.resume_from_checkpoint=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7299942105.68600-95f56dfa-4b13-45bc-8d03-aad354819319/checkpoint-45000" \
# -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
# amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
# :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.ollm3bv2-large-no_lsj-0xlr-bs_64.resume \
# train-sca-pretrain-o365-lsj-scale_lr-110923

# amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2 /mnt/blob/projects/sca-xiaoke-v3/amlt-results/7299935921.15305-a115d837-dada-4074-b41d-f66e1b187cc1/checkpoint-60000" \
# -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
# amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
# :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.ollm3bv2-large-no_lsj-0xlr-bs_64.resume.2 \
# train-sca-pretrain-o365-lsj-scale_lr-110923

# amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2" \
# -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
# amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
# :0=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.gpt2-large-no_lsj-0xlr-bs_64 \
# train-sca-pretrain-o365-lsj-scale_lr-110923

# amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=1" \
# -t msroctovc -w msroctows --sku=8xG8-V100 --no-pre \
# amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
# :0=`date +"%m%d%y"`.octo-8x8-v100-32g-pre.gpt2-large-no_lsj-0xlr-bs_64 \
# train-sca-pretrain-o365-lsj-scale_lr-110923


# resume above due to node 4006 failed
# amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2" \
# -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
# amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
# :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre-no_ib.ollm3bv2-large-no_lsj-0xlr-bs_64.rerun \
# train-sca-pretrain-o365-lsj-scale_lr-110923

# amlt run -d "" --extra-args "training.learning_rate=1e-4 training.per_device_train_batch_size=2" \
# -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
# amlt_configs/train-sca-pretrain-o365-lsj-scale_lr-110923.yaml \
# :1=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre-_ib.ollm3bv2-large-no_lsj-0xlr-bs_64.rerun \
# train-sca-pretrain-o365-lsj-scale_lr-110923
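For readability, here is a minimal sketch (not part of the config) of what the gpt2-large job's launch line amounts to once the folded SHARED_CMD_ARGS scalar from env_defaults is substituted and an --extra-args string from one of the commented submissions above is passed through as $EXTRA_ARGS (amlt appears to expose it under that name); the SHARED_CMD_ARGS value is abbreviated here:

    # Sketch: how SHARED_CMD_ARGS, the per-job overrides, and EXTRA_ARGS combine.
    SHARED_CMD_ARGS="-m src.train +model=base_sca_multitask_v2 training.do_train=True training.max_steps=200000"  # abbreviated
    EXTRA_ARGS="+data_transforms=lsj-0_1-2_0 training.learning_rate=4e-4"
    accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml \
      $SHARED_CMD_ARGS \
      train_data='[objects365-task_type_caption-local]' \
      eval_data='[objects365-task_type_caption-local]' \
      model.lm_head_model_name_or_path=gpt2-large \
      $EXTRA_ARGS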
amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml
ADDED
@@ -0,0 +1,196 @@
env_defaults:


  # NOTE: this kind of string leaded by > will append a new line to the end of the string
  SHARED_CMD_ARGS: >-
    -m src.train
    +model=base_sca_multitask_v2
    training.do_train=True
    training.do_eval=True
    training.do_inference=True
    +data.streaming=False
    training.max_eval_samples=800
    training.max_steps=100000
    training.fp16=True
    training.output_dir=$AMLT_OUTPUT_DIR
    training.output_log_dir=$AMLT_LOGS_DIR
    model.cache_dir=/mnt/blob/weights/.model.cache/
    training.save_strategy=steps
    training.save_steps=5000
    training.save_total_limit=3
    training.optim=adamw_torch
    training.evaluate_before_train=True
    training.per_device_train_batch_size=1
    training.evaluation_strategy=steps
    training.eval_steps=5000
    training.logging_steps=1000
    training.logging_first_step=True
    training.dataloader_num_workers=4
    training.num_masks_per_sample=16
    wandb.project=$AMLT_EXPERIMENT_NAME
    wandb.name=$AMLT_JOB_NAME
    model.num_caption_tokens=8
    model.additional_num_hidden_layers=12
    model.num_task_tokens=6
    training.lr_scheduler_type=cosine
    model.lm_head_model_name_or_path=gpt2-large
    training.learning_rate=1e-4
    training.weight_decay=1e-4
    training.warmup_steps=200
    training.warmup_ratio=0.33333333
    training.compute_metrics=True



environment:

  image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot initialized successfully
  # image: nvidia/pytorch:23.06-py3 # NCCL on PHLRR4076 cannot initialized successfully
  # image: nvidia/pytorch:22.12-py3 # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471 pip install pydantic==1.10.8; not support adamw_torch_fused, as it requires PyTorch 2.0 or higher
  registry: nvcr.io

code:
  local_dir: $CONFIG_DIR/../



jobs:
  - name: gpt2-large
    preemptible: True
    sku: ${NUM_NODES}xG${NUM_GPUS}-V100-IB
    process_count_per_node: 1 # Each node should run 1 process
    command:
      - . amlt_configs/setup.sh
      - source ~/.bashrc
      - . amlt_configs/setup_accelerate_on_azure.sh
      - >-
        accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
        $SHARED_CMD_ARGS
        train_data='[vg-densecap-region_descriptions]'
        eval_data='[vg-densecap-region_descriptions]'
        model.lm_head_model_name_or_path=gpt2-large
        $EXTRA_ARGS


    submit_args:
      env:
        SHARED_MEMORY_PERCENT: 0.5
        HYDRA_FULL_ERROR: 1
        # NCCL_IB_DISABLE: 1
        # NCCL_IBEXT_DISABLE: 1
      container_args:
        shm_size: 256g

  - name: open_llama_3b_v2
    preemptible: True
    sku: ${NUM_NODES}xG${NUM_GPUS}-V100-IB
    process_count_per_node: 1 # Each node should run 1 process
    command:
      - . amlt_configs/setup.sh
      - source ~/.bashrc
      - . amlt_configs/setup_accelerate_on_azure.sh
      - >-
        accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml
        $SHARED_CMD_ARGS
        train_data='[vg-densecap-region_descriptions]'
        eval_data='[vg-densecap-region_descriptions]'
        model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
        training.gradient_checkpointing=true
        $EXTRA_ARGS


    submit_args:
      env:
        SHARED_MEMORY_PERCENT: 0.5
        HYDRA_FULL_ERROR: 1
        # NCCL_IB_DISABLE: 1
        # NCCL_IBEXT_DISABLE: 1
      container_args:
        shm_size: 256g

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr/checkpoint-100000/ training.max_steps=100000 training.learning_rate=4e-4" \
# -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4" \
# -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :1=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.fintune-ollmv2-lr_1e_4-1xlr-lsj train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr/checkpoint-100000/ training.max_steps=100000 training.learning_rate=4e-4" \
# -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.resrch-4x8-v100-16g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4" \
# -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :1=`date +"%m%d%y"`.resrch-4x8-v100-16g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr/checkpoint-100000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
# -t itphcrdellcl1 --vc hcrdell1 --sku=5xG4-V100 --no-pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.dell-5x4-v100-32g-no_pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_2 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
# -t itphcrdellcl1 --vc hcrdell1 --sku=5xG4-V100 --no-pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :1=`date +"%m%d%y"`.dell-5x4-v100-32g-no_pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_2 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr/checkpoint-100000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
# -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_2 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.ollm3bv2-large-lsj-1xlr/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
# -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :1=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_2 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023


# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
# -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_2-o365_1e_4_no_lsj_bs_64 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
# -t itplabrr1cl1 -w resrchvc --sku=4xG8-V100 --pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.rr1-4x8-v100-32g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_2-o365_1e_4_no_lsj_bs_64 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023


# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111323.backup_ckpts.pretrain-o365-no_lsj-bs_64/train-sca-pretrain-o365-lsj-scale_lr-110923/111223.rr1-4x8-v100-32g-pre.gpt2-large-no_lsj-1xlr-bs_2/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
# -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :0=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/weights/111323.backup_ckpts.pretrain-o365-no_lsj-bs_64/train-sca-pretrain-o365-lsj-scale_lr-110923/111223.rr1-4x8-v100-32g-pre.ollm3bv2-large-no_lsj-1xlr-bs_2/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
# -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml :1=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64 train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228594.52312-e52ba12d-9e32-41d4-9630-e8c5d3e47ca0/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
# -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
# :1=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k \
# train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228580.84789-0b1216d8-79dc-46b3-8ef2-57c112e1bd18/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
# -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
# :0=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-gpt2_large-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k \
# train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023

# The o365 ollm3bv2 failed due to devices. try different clusters
# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228594.52312-e52ba12d-9e32-41d4-9630-e8c5d3e47ca0/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=1" \
# -t msrresrchvc -w msrresrchws --sku=8xG4-V100-IB --pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
# :1=`date +"%m%d%y"`.resrch-8x4-v100-16g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k \
# train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228594.52312-e52ba12d-9e32-41d4-9630-e8c5d3e47ca0/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=1" \
# -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
# :1=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k \
# train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228594.52312-e52ba12d-9e32-41d4-9630-e8c5d3e47ca0/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=1" \
# -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
# :1=`date +"%m%d%y"`.resrch-4x8-v100-16g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k \
# train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7300228594.52312-e52ba12d-9e32-41d4-9630-e8c5d3e47ca0/checkpoint-200000/ training.max_steps=100000 training.learning_rate=4e-4 training.per_device_train_batch_size=2" \
# -t itplabrr1cl1 -w resrchvc --sku=2xG8-V100 --pre \
# amlt_configs/train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023.yaml \
# :1=`date +"%m%d%y"`.rr1-2x8-v100-32g-pre.fintune-ollmv2-lr_1e_4-1xlr-lsj-bs_32-o365_4e_4_no_lsj_bs_64_200k.2 \
# train-sca.finetune_lsj_scale_lr-o365_1e_4_1xlr_lsj.111023
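The commented fine-tune submissions above all point model.model_name_or_path at an Objects365 pretrain checkpoint on the mounted blob. A small, hypothetical pre-submit check (not part of the config) can confirm that directory exists and is non-empty before queueing a job; the path below is the gpt2-large checkpoint referenced in the first commented command:

    # Hypothetical helper: verify the pretrain checkpoint exists before `amlt run`.
    CKPT=/mnt/blob/weights/111123.ckpt_backup.o365-pretrain-1e_4-1xlr-lsj-100k_steps/train-sca-pretrain-o365-lsj-scale_lr-110923/110923.octo-4x8-v100-16g-no_pre.gpt2-large-lsj-1xlr/checkpoint-100000/
    if [ -d "$CKPT" ] && [ -n "$(ls -A "$CKPT")" ]; then
      echo "checkpoint found: $CKPT"
    else
      echo "missing or empty checkpoint: $CKPT" >&2
      exit 1
    fi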
data/demo_cases/1000_IM-0003-1001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1000_IM-0003-2001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1000_IM-0003-3001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1001_IM-0004-1001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1001_IM-0004-1002.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1002_IM-0004-1001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1002_IM-0004-2001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1003_IM-0005-2002.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1004_IM-0005-1001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1004_IM-0005-2001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1005_IM-0006-1001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1005_IM-0006-3003.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1006_IM-0007-1001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1006_IM-0007-3003.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1007_IM-0008-1001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1007_IM-0008-2001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1007_IM-0008-3001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1008_IM-0009-2001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1008_IM-0009-4004.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1009_IM-0010-1001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/1009_IM-0010-2001.dcm.png  ADDED (binary image, tracked with Git LFS)
data/demo_cases/100_IM-0002-1001.dcm.png  ADDED (binary image, tracked with Git LFS)