phanerozoic committed on
Commit
47a1863
·
0 Parent(s):

update repository

Browse files
.gitattributes ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ quickstart/data/000002.jpg filter=lfs diff=lfs merge=lfs -text
37
+ quickstart/data/000008.jpg filter=lfs diff=lfs merge=lfs -text
38
+ quickstart/data/000031.jpg filter=lfs diff=lfs merge=lfs -text
39
+ quickstart/data/000058.jpg filter=lfs diff=lfs merge=lfs -text
40
+ quickstart/data/000083.jpg filter=lfs diff=lfs merge=lfs -text
41
+ quickstart/data/000089.jpg filter=lfs diff=lfs merge=lfs -text
42
+ quickstart/data/000191.jpg filter=lfs diff=lfs merge=lfs -text
43
+ quickstart/data/000400.jpg filter=lfs diff=lfs merge=lfs -text
44
+ quickstart/data/000436.jpg filter=lfs diff=lfs merge=lfs -text
45
+ quickstart/data/000452.jpg filter=lfs diff=lfs merge=lfs -text
46
+ quickstart/data/000496.jpg filter=lfs diff=lfs merge=lfs -text
47
+ quickstart/data/000557.jpg filter=lfs diff=lfs merge=lfs -text
48
+ quickstart/data/000575.jpg filter=lfs diff=lfs merge=lfs -text
49
+ quickstart/data/000591.jpg filter=lfs diff=lfs merge=lfs -text
50
+ quickstart/data/000600.jpg filter=lfs diff=lfs merge=lfs -text
51
+ quickstart/data/000665.jpg filter=lfs diff=lfs merge=lfs -text
52
+ quickstart/data/000696.jpg filter=lfs diff=lfs merge=lfs -text
53
+ quickstart/data/000773.jpg filter=lfs diff=lfs merge=lfs -text
54
+ quickstart/data/000781.jpg filter=lfs diff=lfs merge=lfs -text
55
+ quickstart/data/000793.jpg filter=lfs diff=lfs merge=lfs -text
56
+ quickstart/data/000868.jpg filter=lfs diff=lfs merge=lfs -text
57
+ quickstart/data/000880.jpg filter=lfs diff=lfs merge=lfs -text
58
+ quickstart/data/000889.jpg filter=lfs diff=lfs merge=lfs -text
59
+ quickstart/data/000939.jpg filter=lfs diff=lfs merge=lfs -text
60
+ quickstart/data/000957.jpg filter=lfs diff=lfs merge=lfs -text
61
+ quickstart/data/000998.jpg filter=lfs diff=lfs merge=lfs -text
62
+ quickstart/data/001057.jpg filter=lfs diff=lfs merge=lfs -text
63
+ quickstart/data/001078.jpg filter=lfs diff=lfs merge=lfs -text
64
+ quickstart/data/001118.jpg filter=lfs diff=lfs merge=lfs -text
65
+ quickstart/data/001191.jpg filter=lfs diff=lfs merge=lfs -text
66
+ quickstart/data/001289.jpg filter=lfs diff=lfs merge=lfs -text
67
+ quickstart/data/001348.jpg filter=lfs diff=lfs merge=lfs -text
68
+ quickstart/data/001394.jpg filter=lfs diff=lfs merge=lfs -text
69
+ quickstart/data/001430.jpg filter=lfs diff=lfs merge=lfs -text
70
+ quickstart/data/001586.jpg filter=lfs diff=lfs merge=lfs -text
71
+ quickstart/data/001587.jpg filter=lfs diff=lfs merge=lfs -text
72
+ quickstart/data/001599.jpg filter=lfs diff=lfs merge=lfs -text
73
+ quickstart/data/001624.jpg filter=lfs diff=lfs merge=lfs -text
74
+ quickstart/data/001631.jpg filter=lfs diff=lfs merge=lfs -text
75
+ quickstart/data/001634.jpg filter=lfs diff=lfs merge=lfs -text
76
+ quickstart/data/001685.jpg filter=lfs diff=lfs merge=lfs -text
77
+ quickstart/data/001741.jpg filter=lfs diff=lfs merge=lfs -text
78
+ quickstart/data/001763.jpg filter=lfs diff=lfs merge=lfs -text
79
+ quickstart/data/001851.jpg filter=lfs diff=lfs merge=lfs -text
80
+ quickstart/data/001934.jpg filter=lfs diff=lfs merge=lfs -text
81
+ quickstart/data/001949.jpg filter=lfs diff=lfs merge=lfs -text
82
+ quickstart/data/001951.jpg filter=lfs diff=lfs merge=lfs -text
83
+ quickstart/data/001983.jpg filter=lfs diff=lfs merge=lfs -text
84
+ quickstart/data/002022.jpg filter=lfs diff=lfs merge=lfs -text
85
+ quickstart/data/002070.jpg filter=lfs diff=lfs merge=lfs -text
86
+ quickstart/data/002086.jpg filter=lfs diff=lfs merge=lfs -text
87
+ quickstart/data/002186.jpg filter=lfs diff=lfs merge=lfs -text
88
+ quickstart/data/002284.jpg filter=lfs diff=lfs merge=lfs -text
89
+ quickstart/data/002334.jpg filter=lfs diff=lfs merge=lfs -text
90
+ quickstart/data/002450.jpg filter=lfs diff=lfs merge=lfs -text
91
+ quickstart/data/002468.jpg filter=lfs diff=lfs merge=lfs -text
92
+ quickstart/data/002489.jpg filter=lfs diff=lfs merge=lfs -text
93
+ quickstart/data/002497.jpg filter=lfs diff=lfs merge=lfs -text
94
+ quickstart/data/002514.jpg filter=lfs diff=lfs merge=lfs -text
95
+ quickstart/data/002538.jpg filter=lfs diff=lfs merge=lfs -text
96
+ quickstart/data/002553.jpg filter=lfs diff=lfs merge=lfs -text
97
+ quickstart/data/002586.jpg filter=lfs diff=lfs merge=lfs -text
98
+ quickstart/data/002592.jpg filter=lfs diff=lfs merge=lfs -text
99
+ quickstart/data/002598.jpg filter=lfs diff=lfs merge=lfs -text
100
+ quickstart/data/002640.jpg filter=lfs diff=lfs merge=lfs -text
101
+ quickstart/data/002660.jpg filter=lfs diff=lfs merge=lfs -text
102
+ quickstart/data/002671.jpg filter=lfs diff=lfs merge=lfs -text
103
+ quickstart/data/002799.jpg filter=lfs diff=lfs merge=lfs -text
104
+ quickstart/data/002905.jpg filter=lfs diff=lfs merge=lfs -text
105
+ quickstart/data/002939.jpg filter=lfs diff=lfs merge=lfs -text
106
+ quickstart/data/002953.jpg filter=lfs diff=lfs merge=lfs -text
107
+ quickstart/data/003084.jpg filter=lfs diff=lfs merge=lfs -text
108
+ quickstart/data/003132.jpg filter=lfs diff=lfs merge=lfs -text
109
+ quickstart/data/003148.jpg filter=lfs diff=lfs merge=lfs -text
110
+ quickstart/data/003254.jpg filter=lfs diff=lfs merge=lfs -text
111
+ quickstart/data/003344.jpg filter=lfs diff=lfs merge=lfs -text
112
+ quickstart/data/003391.jpg filter=lfs diff=lfs merge=lfs -text
113
+ quickstart/data/003420.jpg filter=lfs diff=lfs merge=lfs -text
114
+ quickstart/data/003502.jpg filter=lfs diff=lfs merge=lfs -text
115
+ quickstart/data/003541.jpg filter=lfs diff=lfs merge=lfs -text
116
+ quickstart/data/003614.jpg filter=lfs diff=lfs merge=lfs -text
117
+ quickstart/data/003665.jpg filter=lfs diff=lfs merge=lfs -text
118
+ quickstart/data/003713.jpg filter=lfs diff=lfs merge=lfs -text
119
+ quickstart/data/003754.jpg filter=lfs diff=lfs merge=lfs -text
120
+ quickstart/data/003805.jpg filter=lfs diff=lfs merge=lfs -text
121
+ quickstart/data/003871.jpg filter=lfs diff=lfs merge=lfs -text
122
+ quickstart/data/003880.jpg filter=lfs diff=lfs merge=lfs -text
123
+ quickstart/data/003911.jpg filter=lfs diff=lfs merge=lfs -text
124
+ quickstart/data/003978.jpg filter=lfs diff=lfs merge=lfs -text
125
+ quickstart/data/004039.jpg filter=lfs diff=lfs merge=lfs -text
126
+ quickstart/data/004066.jpg filter=lfs diff=lfs merge=lfs -text
127
+ quickstart/data/004082.jpg filter=lfs diff=lfs merge=lfs -text
128
+ quickstart/data/004096.jpg filter=lfs diff=lfs merge=lfs -text
129
+ quickstart/data/004131.jpg filter=lfs diff=lfs merge=lfs -text
130
+ quickstart/data/004170.jpg filter=lfs diff=lfs merge=lfs -text
131
+ quickstart/data/004172.jpg filter=lfs diff=lfs merge=lfs -text
132
+ quickstart/data/004263.jpg filter=lfs diff=lfs merge=lfs -text
133
+ quickstart/data/004304.jpg filter=lfs diff=lfs merge=lfs -text
134
+ quickstart/data/004315.jpg filter=lfs diff=lfs merge=lfs -text
135
+ quickstart/data/004329.jpg filter=lfs diff=lfs merge=lfs -text
136
+ quickstart/data/004371.jpg filter=lfs diff=lfs merge=lfs -text
137
+ quickstart/data/004431.jpg filter=lfs diff=lfs merge=lfs -text
138
+ quickstart/data/004510.jpg filter=lfs diff=lfs merge=lfs -text
139
+ quickstart/data/004514.jpg filter=lfs diff=lfs merge=lfs -text
140
+ quickstart/data/004517.jpg filter=lfs diff=lfs merge=lfs -text
141
+ quickstart/data/004525.jpg filter=lfs diff=lfs merge=lfs -text
142
+ quickstart/data/004535.jpg filter=lfs diff=lfs merge=lfs -text
143
+ quickstart/data/004546.jpg filter=lfs diff=lfs merge=lfs -text
144
+ quickstart/data/004548.jpg filter=lfs diff=lfs merge=lfs -text
145
+ quickstart/data/004557.jpg filter=lfs diff=lfs merge=lfs -text
146
+ quickstart/data/004585.jpg filter=lfs diff=lfs merge=lfs -text
147
+ quickstart/data/004590.jpg filter=lfs diff=lfs merge=lfs -text
148
+ quickstart/data/004610.jpg filter=lfs diff=lfs merge=lfs -text
149
+ quickstart/data/004627.jpg filter=lfs diff=lfs merge=lfs -text
150
+ quickstart/data/004651.jpg filter=lfs diff=lfs merge=lfs -text
151
+ quickstart/data/004656.jpg filter=lfs diff=lfs merge=lfs -text
152
+ quickstart/data/004702.jpg filter=lfs diff=lfs merge=lfs -text
153
+ quickstart/data/004743.jpg filter=lfs diff=lfs merge=lfs -text
154
+ quickstart/data/004755.jpg filter=lfs diff=lfs merge=lfs -text
155
+ quickstart/data/004775.jpg filter=lfs diff=lfs merge=lfs -text
156
+ quickstart/data/004781.jpg filter=lfs diff=lfs merge=lfs -text
157
+ quickstart/data/004831.jpg filter=lfs diff=lfs merge=lfs -text
158
+ quickstart/data/004852.jpg filter=lfs diff=lfs merge=lfs -text
159
+ quickstart/data/004939.jpg filter=lfs diff=lfs merge=lfs -text
160
+ quickstart/data/004978.jpg filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FAIR Noncommercial Research License
2
+
3
+ *v1 Last Updated: August 18, 2025*
4
+
5
+ **"Acceptable Use Policy"** means the FAIR Acceptable Use Policy, applicable to Research Materials, that is incorporated into this Agreement.
6
+
7
+ **"Agreement"** means the terms and conditions for use, reproduction, distribution and modification of the Research Materials set forth herein.
8
+
9
+ **"Documentation"** means the specifications, manuals and documentation accompanying
10
+ Research Materials distributed by Meta.
11
+
12
+ **"Licensee"** or **"you"** means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity's behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
13
+
14
+ **"Meta"** or **"we"** means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
15
+
16
+ **"Noncommercial Research Uses"** means noncommercial research use cases related to research, development, education, processing, or analysis and in each case, is not primarily intended for commercial advantage or monetary compensation to you or others.
17
+
18
+ **"Research Materials"** means, collectively, Documentation and the models, software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code, demonstration materials and other elements of the foregoing distributed by Meta and made available under this Agreement.
19
+
20
+ By clicking "I Accept" below or by using or distributing any portion or element of the Research Materials, you agree to be bound by this Agreement.
21
+
22
+ ## 1. License Rights and Redistribution.
23
+
24
+ a. <ins>Grant of Rights</ins>. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta's intellectual property or other rights owned by Meta embodied in the Research Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Research Materials.
25
+
26
+ b. <ins>Redistribution and Use</ins>.
27
+
28
+ i. You will not use the Research Materials or any outputs or results of the Research Materials in connection with any commercial uses or for any uses other than Noncommercial Research Uses;
29
+
30
+ ii. Distribution of Research Materials, and any derivative works thereof, are subject to the terms of this Agreement. If you distribute or make the Research Materials, or any derivative works thereof, available to a third party, you may only do so under the terms of this Agreement. You shall also provide a copy of this Agreement to such third party.
31
+
32
+ iii. If you submit for publication the results of research you perform on, using, or otherwise in connection with Research Materials, you must acknowledge the use of Research Materials in your publication.
33
+
34
+ iv. Your use of the Research Materials must comply with applicable laws and regulations (including Trade Control Laws) and adhere to the FAIR Acceptable Use Policy, which is hereby incorporated by reference into this Agreement.
35
+
36
+ ## 2. User Support.
37
+
38
+ Your Noncommercial Research Use of the Research Materials is done at your own discretion; Meta does not process any information nor provide any service in relation to such use. Meta is under no obligation to provide any support services for the Research Materials. Any support provided is "as is", "with all faults", and without warranty of any kind.
39
+
40
+ ## 3. Disclaimer of Warranty.
41
+
42
+ UNLESS REQUIRED BY APPLICABLE LAW, THE RESEARCH MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE RESEARCH MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE RESEARCH MATERIALS AND ANY OUTPUT AND RESULTS.
43
+
44
+ ## 4. Limitation of Liability.
45
+
46
+ IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT OR INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
47
+
48
+ ## 5. Intellectual Property.
49
+
50
+ a. Subject to Meta's ownership of Research Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Research Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
51
+
52
+ b. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Research Materials, outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Research Materials.
53
+
54
+ ## 6. Term and Termination.
55
+
56
+ The term of this Agreement will commence upon your acceptance of this Agreement or access to the Research Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Research Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.
57
+
58
+ ## 7. Governing Law and Jurisdiction.
59
+
60
+ This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
61
+
62
+ ## 8. Modifications and Amendments.
63
+
64
+ Meta may modify this Agreement from time to time; provided that they are similar in spirit to the current version of the Agreement, but may differ in detail to address new problems or concerns. All such changes will be effective immediately. Your continued use of the Research Materials after any modification to this Agreement constitutes your agreement to such modification. Except as provided in this Agreement, no modification or addition to any provision of this Agreement will be binding unless it is in writing and signed by an authorized representative of both you and Meta.
65
+
66
+ ## FAIR Acceptable Use Policy
67
+
68
+ The Fundamental AI Research (FAIR) team at Meta seeks to further understanding of new and existing research domains with the mission of advancing the state-of-the-art in artificial intelligence through open research for the benefit of all.
69
+
70
+ As part of this mission, Meta makes certain research materials available for noncommercial research use. Meta is committed to promoting the safe and responsible use of such research materials.
71
+
72
+ ### Prohibited Uses
73
+
74
+ You agree you will not use, or allow others to use, Research Materials to:
75
+
76
+ 1. Violate the law or others' rights, including to:
77
+ Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as:
78
+ Violence or terrorism
79
+ Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material
80
+ Human trafficking, exploitation, and sexual violence
81
+ The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials.
82
+ Sexual solicitation
83
+ Any other criminal activity
84
+
85
+ Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals
86
+
87
+ Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services
88
+
89
+ Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices
90
+
91
+ Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws
92
+
93
+ Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any technology using FAIR research materials
94
+
95
+ Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system
96
+
97
+ 2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of research artifacts related to the following:
98
+
99
+ Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are subject to the International Traffic Arms Regulations (ITAR) maintained by the United States Department of State
100
+
101
+ Guns and illegal weapons (including weapon development)
102
+
103
+ Illegal drugs and regulated/controlled substances
104
+
105
+ Operation of critical infrastructure, transportation technologies, or heavy machinery
106
+
107
+ Self-harm or harm to others, including suicide, cutting, and eating disorders
108
+
109
+ Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual
110
+
111
+ 3. Intentionally deceive or mislead others, including use of FAIR Research Materials related to the following:
112
+
113
+ Generating, promoting, or furthering fraud or the creation or promotion of disinformation
114
+
115
+ Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content
116
+
117
+ Generating, promoting, or further distributing spam
118
+
119
+ Impersonating another individual without consent, authorization, or legal right
120
+
121
+ Representing that outputs of FAIR research materials or outputs from technology using FAIR research materials are human-generated
122
+
123
+ Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement
124
+
125
+ 4. Fail to appropriately disclose to end users any known dangers of your Research Materials.
126
+
127
+ Please report any violation of this Policy or other problems that could lead to a violation of this Policy by submitting a report [here](https://docs.google.com/forms/d/e/1FAIpQLSeb11cryAopJ7LNrC4nxEUXrHY26hfkXQMf_uH-oFgA3WlYZQ/viewform).
README.md ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: fair-research-license
4
+ license_link: LICENSE
5
+ base_model: facebook/EUPE-ViT-B
6
+ tags:
7
+ - multi-task-perception
8
+ - computer-vision
9
+ - image-classification
10
+ - semantic-segmentation
11
+ - depth-estimation
12
+ - object-detection
13
+ - keypoint-correspondence
14
+ - vision-transformer
15
+ library_name: pytorch
16
+ datasets:
17
+ - imagenet-1k
18
+ - scene_parse_150
19
+ - sayakpaul/nyu_depth_v2
20
+ - detection-datasets/coco
21
+ metrics:
22
+ - accuracy
23
+ - mean_iou
24
+ - mAP
25
+ ---
26
+
27
+ # Argus
28
+
29
+ Argus is a multi-task perception system built on a single frozen vision backbone. One forward pass through the encoder produces classification labels, semantic segmentation masks, metric depth maps, object detections, and dense keypoint correspondences. Roughly 103M parameters total, with the 86M backbone frozen and about 17.3M learnable across five task heads. Named after Argus Panoptes, the many-eyed giant of Greek mythology tasked with watching over everything at once.
30
+
31
+ The backbone is [EUPE-ViT-B](https://huggingface.co/facebook/EUPE-ViT-B), introduced in *Efficient Universal Perception Encoder* (Zhu et al., Meta FAIR, [arXiv:2603.22387](https://arxiv.org/abs/2603.22387), March 2026). EUPE distills a small vision encoder from a collection of larger specialist teachers, producing features that transfer well to image understanding, dense prediction, and vision-language tasks simultaneously. Argus leaves those weights frozen and attaches five lightweight heads.
32
+
33
+ ## Architecture
34
+
35
+ ```
36
+ Image → EUPE-ViT-B (frozen, 86M) → shared features
37
+
38
+ ├── Classification trained linear softmax, 1000 ImageNet classes
39
+ ├── Segmentation BN + 1×1 Conv, 150 ADE20K classes
40
+ ├── Depth DPT multi-scale decoder, metric depth (meters), NYU Depth V2
41
+ ├── Detection split-tower on a multi-scale feature decomposition, 80 COCO classes
42
+ └── Correspondence training-free dense feature matching
43
+ ```
44
+
45
+ | Head | Params | Description |
46
+ |---|---|---|
47
+ | Classification | 769K | `Linear(768, 1000)` softmax on the L2-normalized CLS token |
48
+ | Segmentation | 117K | `BatchNorm2d(768) → Conv2d(768, 150, 1×1)` at stride 16, bilinear-upsampled to input resolution |
49
+ | Depth | 13.45M | DPT fusing backbone blocks [2, 5, 8, 11], 256 depth bins over 0.001 to 10 m |
50
+ | Detection | 2.98M | 5 prediction levels at strides [8, 16, 32, 64, 128], cosine similarity against CLIP ViT-L/14 text embeddings |
51
+ | Correspondence | 0 | cosine-max on backbone spatial features |
52
+
53
+ ## Benchmarks
54
+
55
+ ### EUPE paper reproduction
56
+
57
+ All four reported benchmarks were reproduced as part of building Argus.
58
+
59
+ | Task | Dataset | Metric | Paper | Argus | Delta |
60
+ |---|---|---|---|---|---|
61
+ | Classification | ImageNet-1k | kNN k=10 top-1 | 84.1 | 84.07 | −0.03 |
62
+ | Segmentation | ADE20K | mean IoU | 52.4 | 52.72 | +0.32 |
63
+ | Depth | NYU Depth V2 | RMSE (lower is better) | 0.391 | 0.3914 | +0.0004 |
64
+ | Correspondence | SPair-71k | PCK@0.1 | 51.3 | 54.35 | +3.05 |
65
+
66
+ ### Shipped task metrics
67
+
68
+ | Task | Dataset | Metric | Value |
69
+ |---|---|---|---|
70
+ | Classification | ImageNet-1k val | top-1 / top-5 | 85.53 / 97.69 |
71
+ | Segmentation | ADE20K val | mIoU | 52.72 |
72
+ | Depth | NYU Depth V2 test | RMSE / abs_rel / a1 | 0.480 / 0.219 / 0.872 |
73
+ | Detection | COCO val2017 | mAP @[.5:.95] | 42.64 (42.71 soft NMS) |
74
+ | Correspondence | SPair-71k | PCK@0.1 | 54.35 |
75
+
76
+ The shipped classifier is a trained linear softmax layer (85.53% top-1) that superseded the kNN protocol used during paper reproduction. The shipped depth head is a DPT decoder that improves RMSE by 8% and abs_rel by 28% over a linear probe on the same backbone (0.480 vs 0.520 RMSE).
77
+
78
+ ### Detection detail (COCO val2017)
79
+
80
+ | Metric | Value |
81
+ |---|---|
82
+ | mAP@[0.5:0.95] | **42.64** |
83
+ | mAP@0.50 | 65.70 |
84
+ | mAP@0.75 | 45.10 |
85
+ | mAP (small / medium / large) | 22.31 / 48.33 / 62.90 |
86
+
87
+ At 2.98M learnable parameters the detection head passes the 16.14M FCOS simple-feature-pyramid baseline (41.0 mAP) by +1.64, using 18.4% of its head parameter budget. Small-object mAP is 22.3 against FCOS's 19.4 (+2.9). The backbone was never exposed to detection data; these are the same frozen features used for every other task.
88
+
89
+ Evaluation protocol: per-class hard NMS (IoU 0.5), score threshold 0.05, top-100 detections per image, pycocotools on COCO val2017.
90
+
91
+ The standalone checkpoint and related detection-head work live in [phanerozoic/detection-heads](https://huggingface.co/phanerozoic/detection-heads).
92
+
93
+ ### Cross-Dataset Detection Transfer
94
+
95
+ To test whether the detection head's features generalize beyond COCO, the shipping 2.98M detection head (trained on COCO 2017 at 768px) and the 16.14M FCOS baseline (trained on COCO 2017 at 640px) were each evaluated zero-shot against the 20 RF100-VL validation domains. Both heads saw only COCO during training; RF100-VL was never exposed to either. Evaluation is class-agnostic AR@100 (all detections relabeled to a single "object" class, all ground-truth boxes relabeled likewise) so that localization transfer can be measured even on domains whose label space does not overlap COCO-80.
96
+
97
+ | domain | FCOS (16.1M) | Ours (3.0M) | Δ |
98
+ |---------------------------------------------------------|-------------:|------------:|------:|
99
+ | actions | 37.5 | 39.6 | +2.1 |
100
+ | aerial-airport | 16.1 | 17.3 | +1.1 |
101
+ | all-elements | 2.3 | 7.9 | +5.6 |
102
+ | aquarium-combined | 47.5 | 58.2 | +10.6 |
103
+ | defect-detection | 0.1 | 0.3 | +0.2 |
104
+ | dentalai | 0.2 | 0.9 | +0.7 |
105
+ | flir-camera-objects | 53.1 | 54.3 | +1.2 |
106
+ | gwhd2021 | 1.7 | 1.5 | -0.3 |
107
+ | lacrosse-object-detection | 57.9 | 66.6 | +8.7 |
108
+ | new-defects-in-wood | 5.6 | 14.6 | +9.0 |
109
+ | orionproducts | 17.1 | 25.5 | +8.5 |
110
+ | paper-parts | 19.3 | 22.2 | +2.8 |
111
+ | recode-waste | 11.4 | 11.8 | +0.4 |
112
+ | soda-bottles | 29.6 | 35.8 | +6.3 |
113
+ | the-dreidel-project | 57.7 | 65.2 | +7.4 |
114
+ | trail-camera | 60.1 | 69.6 | +9.5 |
115
+ | water-meter | 0.7 | 0.0 | -0.6 |
116
+ | wb-prova | 83.6 | 86.2 | +2.6 |
117
+ | wildfire-smoke | 0.3 | 0.5 | +0.2 |
118
+ | x-ray-id | 0.0 | 0.0 | 0.0 |
119
+ | **RF100-VL AR@100 mean** | **25.1** | **28.9** | **+3.8** |
120
+ | **Domain wins** | **3** | **17** | |
121
+
122
+ The detection head wins 17 of 20 domains, loses 3, with mean AR@100 +3.8 over the 5× larger FCOS baseline. The largest gaps are on domains far from COCO's distribution: aquarium-combined (+10.6), trail-camera (+9.5), new-defects-in-wood (+9.0), lacrosse-object-detection (+8.7), orionproducts (+8.5), the-dreidel-project (+7.4), soda-bottles (+6.3), all-elements (+5.6). The three losses are small (≤0.6 AR; the x-ray-id difference rounds to 0.0 at the table's precision) and occur on domains with very low absolute AR for both heads (gwhd2021 wheat-head crops, water-meter digit reads, x-ray-id anatomical landmarks). The interpretation is that the backbone's multi-teacher distilled features are general enough that a head one-fifth the size of the FCOS head, trained on top of the frozen backbone, transfers across wildly different visual domains at the same level or better.
123
+
124
+ ### Cross-Dataset Segmentation Transfer
125
+
126
+ A separate BN+1×1 linear probe was trained on Cityscapes on top of the frozen backbone, using the same training recipe as the ADE20K head. The backbone was never exposed to driving scenes during EUPE distillation or Argus head training.
127
+
128
+ | Dataset | Classes | Train images | mIoU |
129
+ |---|---|---|---|
130
+ | ADE20K (shipped head) | 150 | 20,210 | 52.72 |
131
+ | Cityscapes (transfer probe) | 19 | 2,975 | 63.76 |
132
+
133
+ The Cityscapes probe scores road 96.4, car 87.9, sky 88.8, building 86.7, vegetation 85.6. The weaker categories are thin vertical structures (pole 17.8, traffic light 36.4, traffic sign 48.3), which is an inherent resolution limitation of the stride-16 patch grid rather than a deficiency in the learned representation.
134
+
135
+ ## Comparison with standard baselines
136
+
137
+ As a sanity check, Argus was compared against several well-known models on the same 200-image COCO subset. The classification comparison uses a keyword cross-reference between each model's top-k ImageNet predictions and the COCO ground-truth detection labels on those images, which provides a consistent yardstick across differently-trained models despite the label-space mismatch. **These hit rates measure agreement with COCO detection labels via keyword matching on the 200-image subset; they are not raw ImageNet accuracy.** For reference, all three classifiers exceed 80% top-1 on the full ImageNet validation set.
138
+
139
+ **Classification** (hit rate against COCO detection labels, 200 images):
140
+
141
+ | Model | Parameters | Top-1 hit | Top-5 hit | Latency | Peak VRAM |
142
+ |--------------------|------------|-----------|-----------|---------|-----------|
143
+ | Argus (EUPE-ViT-B) | 86 M | 42.2% | 66.8% | 13.1 ms | 0.34 GB |
144
+ | ConvNeXt-Base | 89 M | 40.2% | 71.4% | 10.4 ms | 0.35 GB |
145
+ | ResNet50 | 26 M | 36.2% | 61.8% | 8.4 ms | 0.12 GB |
146
+
147
+ **Segmentation**:
148
+
149
+ | Model | Parameters | Classes | Latency | Peak VRAM |
150
+ |----------------------------|------------|---------|---------|-----------|
151
+ | Argus (EUPE + linear head) | 86 M | 150 | 11.8 ms | 0.41 GB |
152
+ | DeepLabV3-ResNet50 | 42 M | 21 | 15.9 ms | 0.33 GB |
153
+
154
+ **Depth**:
155
+
156
+ | Model | Parameters | Latency | Peak VRAM |
157
+ |----------------------------|------------|---------|-----------|
158
+ | Argus (EUPE + linear head) | 86 M | 13.3 ms | 0.35 GB |
159
+ | Depth-Anything-V2-Base | 98 M | 18.8 ms | 0.68 GB |
160
+
161
+ Argus achieves the highest top-1 hit rate of the three image classifiers, while ConvNeXt-Base is ahead on top-5 (71.4% vs. 66.8%). The Argus classification row above was measured with the kNN method during the original head-to-head comparison; the current shipped classifier (trained linear softmax) would widen the top-5 margin. Argus is faster than DeepLabV3 while predicting a much richer label space, and it is faster than Depth-Anything-V2 while using roughly half the VRAM. Although these baselines and Argus were trained for different objectives on different datasets, the comparison is useful for understanding what the model delivers in practice.
162
+
163
+ ### Multi-Task Throughput
164
+
165
+ The per-task comparisons above measure each head against its single-task counterpart in isolation. A separate question is what happens when a user needs all of the tasks at once, which is the typical situation in dataset annotation, model evaluation, and any pipeline where images pass through multiple analysis stages in sequence. The alternative to Argus in that situation is to load and run four separate single-task models of comparable quality, each carrying its own backbone, its own preprocessing, and its own forward pass. The total cost is the sum of the four individual inference times, plus the memory overhead of holding four independent models on the device simultaneously.
166
+
167
+ The models chosen for this comparison were selected to match the quality tier of the EUPE-ViT-B backbone rather than to minimize size or maximize speed. ConvNeXt-Base (88.6M parameters) is a widely-used ImageNet-1k classifier at the same parameter scale as EUPE-ViT-B. SegFormer-B3 (47.3M) is a transformer-based ADE20K semantic segmenter that is the standard mid-range alternative to a linear probe on a frozen backbone. Depth-Anything-V2-Base (97.5M) is the current standard for single-image monocular depth estimation at base scale. YOLO26l (26.3M) is the large variant of the January 2026 YOLO release from Ultralytics, representing the state of the art in efficient real-time detection. All measurements were taken on an NVIDIA RTX 6000 Ada across the same nine example images, with five timed runs after a three-image warmup pass to eliminate cold-start effects.
168
+
169
+ | Pipeline | Parameters | Latency per image | Tasks |
170
+ |----------|-----------|-------------------|-------|
171
+ | Argus unified | 103 M | 56 ms | 5 (classify, segment, depth, detect, correspond) |
172
+ | Four separate models | 260 M | 68 ms | 4 (classify, segment, depth, detect) |
173
+
174
+ The per-model breakdown for the separate pipeline is ConvNeXt-Base at 6 ms, SegFormer-B3 at 19 ms, Depth-Anything-V2-Base at 31 ms, and YOLO26l at 12 ms, summing to 68 ms when the tasks are run sequentially on the same image. Argus completes five tasks (the same four plus keypoint correspondence, which the separate pipeline does not attempt) in 56 ms from a single model load. The total parameter count for the separate pipeline is 260M across four independent weight sets, while Argus carries 103M in a single file.
175
+
176
+ The throughput advantage comes from the shared backbone. Each of the four separate models pays the cost of encoding the image through its own network before producing task-specific output. Argus encodes the image once through EUPE-ViT-B and then routes the resulting features to five lightweight heads, each of which adds only a few milliseconds on top of the shared representation. The backbone forward pass is the dominant cost in both pipelines, and running it once rather than four times is where the 1.2x throughput improvement and 2.5x parameter reduction originate. The practical consequence for deployment is that Argus requires a single model download, a single checkpoint load, and a single Python import, where the equivalent separate-model pipeline requires four downloads totaling over a gigabyte, four independent weight sets held concurrently, and four separate dependency trees to manage.
177
+
178
+ ## Usage
179
+
180
+ ```python
181
+ from PIL import Image
182
+ from transformers import AutoModel
183
+
184
+ model = AutoModel.from_pretrained("phanerozoic/argus", trust_remote_code=True)
185
+ image = Image.open("your_image.jpg").convert("RGB")
186
+
187
+ top5 = model.classify(image, top_k=5)
188
+ seg = model.segment(image) # [H, W] class indices
189
+ depth = model.depth(image) # [H, W] metric depth in meters
190
+ dets = model.detect(image, score_thresh=0.3)
191
+ # dets: list of {"box": [x1, y1, x2, y2], "score", "label", "class_name"}
192
+
193
+ # Three tasks at once (shared backbone forward inside perceive)
194
+ result = model.perceive(image)
195
+ # result["classification"], ["segmentation"], ["depth"], ["timings_ms"]
196
+
197
+ # Keypoint correspondence between two images
198
+ target = Image.open("other_image.jpg").convert("RGB")
199
+ predicted = model.correspond(image, target, [[100, 100], [200, 200]])
200
+ ```
201
+
202
+ Every single-image method also accepts a list of PIL images and returns a list of per-image results in the same shape a single call would produce.
203
+
204
+ ### Confidence outputs
205
+
206
+ ```python
207
+ seg_map, seg_conf = model.segment(image, return_confidence=True)
208
+ # seg_conf is per-pixel max softmax probability in [0, 1]
209
+
210
+ depth_map, depth_std = model.depth(image, return_confidence=True)
211
+ # depth_std is per-pixel standard deviation of the 256-bin distribution
212
+
213
+ result = model.perceive(image, return_confidence=True)
214
+ # result["segmentation_confidence"], result["depth_uncertainty"]
215
+ ```
216
+
217
+ Classification always carries a `margin` field (top-1 minus top-2 score) on the first entry.
218
+
219
+ ### ONNX export
220
+
221
+ ```python
222
+ paths = model.export_onnx("/path/to/out_dir", backbone_resolution=640, verify=True)
223
+ # backbone, classifier, seg_head, depth_head, detection_head (five graphs)
224
+ ```
225
+
226
+ The segmentation graph folds bilinear upsample to input resolution inside the graph, so consumers argmax directly. The classifier graph is self-contained (softmax weights captured as buffers). The depth head accepts four intermediate ViT-block activations as separate positional tensor inputs. The detection head returns pre-NMS per-location boxes and scores by default, or with `include_nms=True` bakes ONNX `NonMaxSuppression` (opset ≥ 10) into the detection graph for single-shot TensorRT or mobile inference. Correspondence has no learned parameters and needs no graph.
227
+
228
+ Tolerance for `verify=True` can be a float or a dict keyed by verification output name. When a float is passed, detection box coordinates get a resolution-scaled tolerance because `exp()` in the regression path amplifies FP kernel-dispatch differences to pixel scale.
229
+
230
+ ### INT8 quantization
231
+
232
+ ```python
233
+ model = AutoModel.from_pretrained("phanerozoic/argus", trust_remote_code=True)
234
+ model = model.cuda().eval().quantize_int8() # requires: pip install torchao
235
+ ```
236
+
237
+ Weight-only INT8 quantization via torchao. Linear weights go to INT8; activations stay in BF16. Classification agreement with FP32 is 100%, depth drift averages 0.013 m. Reduces weight VRAM substantially. Latency behaviour depends on whether the target GPU has an INT8 tensor-core path torchao can dispatch to.
238
+
239
+ ### Precision variants
240
+
241
+ Two safetensors files are provided, with identical inference behaviour but different on-disk precision.
242
+
243
+ | File | Load |
244
+ |---|---|
245
+ | `model.safetensors` | `AutoModel.from_pretrained("phanerozoic/argus", trust_remote_code=True)` |
246
+ | `model.bf16_backbone.safetensors` | add `variant="bf16_backbone"` |
247
+
248
+ Both load into the same FP32 model in memory; PyTorch upcasts the stored bfloat16 weights at construction. The smaller variant saves download bandwidth only.
249
+
250
+ ## Training
251
+
252
+ The backbone is frozen for every task. Only the task heads are trained; the kNN class prototypes used during paper reproduction were extracted (not trained at all).
253
+
254
+ | Component | Source | Method |
255
+ |---|---|---|
256
+ | Segmentation | ADE20K (20,210 train) | Linear probe, CE loss, AdamW lr 1e-3, 512×512, 40,000 iterations |
257
+ | Depth | NYU Depth V2 (24,231 train) | DPT decoder, SILog loss, AdamW lr 1e-4, 416×416, 38,400 iterations |
258
+ | Linear softmax classifier | ImageNet-1k (1.28M train) | Cached CLS features, SGD momentum 0.9, cosine LR, 100 epochs |
259
+ | Detection | COCO 2017 (117,266 train) | Split-tower on a multi-scale decomposition of frozen features, ATSS, focal + GIoU + BCE, AdamW lr 5e-4, 768×768, 16 ep + 3 ep partial calibration |
260
+ | Correspondence | none | training-free cosine similarity |
261
+
262
+ ### Backbone simplification
263
+
264
+ The upstream EUPE-ViT-B release ships a `LinearKMaskedBias` wrapper around each block's QKV projection. In the released weights both the `bias_mask` and the `bias` are filled with zeros across all twelve blocks, so the masked bias is identically zero at every forward pass. The Argus backbone drops the 24 redundant tensors entirely (12 × `qkv.bias` + 12 × `qkv.bias_mask`, 55,296 values total), and the attention blocks are constructed with `qkv_bias=False, mask_k_bias=False`. FP32 forward is bitwise-equivalent for classification, segmentation, detection, and correspondence. The DPT depth decoder shows sub-centimeter drift under BF16 autocast; the drift is an order of magnitude smaller than the head's own 39-centimeter NYU Depth V2 RMSE and causes no visible change in depth maps. To load the upstream EUPE-ViT-B release directly into this backbone class, pass `strict=False` to `load_state_dict` so the extra keys in the upstream checkpoint are silently ignored.
265
+
266
+ ### Head details
267
+
268
+ **Segmentation.** `BatchNorm2d(768) → Conv2d(768, 150, 1×1)`, 116,886 parameters. Trained at 512×512 with cross-entropy loss, AdamW (lr 1e-3, weight decay 1e-3), WarmupOneCycleLR with 1500-step warmup, batch 16.
269
+
270
+ **Depth (DPT).** Hooks into backbone blocks [2, 5, 8, 11] via PyTorch forward hooks, capturing intermediate representations without modifying the backbone. A reassemble stage projects each block's output from 768 to 256 channels via LayerNorm + Linear, reshapes to spatial grids, and rescales to strides [4, 8, 16, 32]. A bottom-up fusion path combines the four scales through residual conv blocks with skip connections. A final conv head produces 256 depth-bin logits; metric depth is the bin-weighted sum. 13,450,000 parameters. Trained at 416×416 with SILog loss, AdamW (lr 1e-4, weight decay 1e-3), cosine schedule with 3% warmup, batch 16, 38,400 iterations.
271
+
272
+ **Linear softmax classifier.** A single `Linear(768, 1000)` layer with bias applied to the L2-normalized CLS token, 769,000 parameters. Trained as a two-pass job: first the frozen backbone runs over ImageNet-1k train to cache a per-image CLS feature tensor (1,281,167 × 768), then the linear layer trains on the cached features alone with SGD (momentum 0.9, weight decay 0), batch 4096, cosine schedule, 100 epochs, no augmentation. A small LR sweep over {0.5, 1.0, 3.0, 10.0, 30.0} selected lr=30.0: L2-normalized features plus zero-initialized weights require an unusually large learning rate to grow the weight scale to the point where the softmax distribution sharpens. The best run reached 85.53% top-1 and 97.69% top-5 on ImageNet-1k val.
273
+
274
+ **Detection (split-tower on a cofiber decomposition of frozen features).** Anchor-free. The multi-scale decomposition is applied per-channel to the 768-D feature map: 2×2 average pool reduce, bilinear upsample expand, band_k = x_k − U(D(x_k)) with x_{k+1} = D(x_k). Zero parameters, replacing an 11M-parameter FPN. Five prediction levels at strides [8, 16, 32, 64, 128]: four bands at 16, 32, 64, 128, plus a stride-8 level from a single transposed convolution on the stride-16 band. The Rocq/HoTT formalization in phanerozoic/cofiber-detection proves that this is a split short exact sequence in any semi-additive category with an adjoint retraction pair (U, D), with each band equal to ker(D_k). Burt & Adelson's 1983 Laplacian pyramid is the scalar-image instance with Gaussian reduce. Separate classification and regression towers of depth nine (five 3×3 ConvGN blocks followed by four depthwise residual blocks at 160 hidden channels) process each level with weights shared across levels. Top-down lateral connections pass information from coarser to finer bands before the towers run. Classification is cosine similarity between a `Linear(160, 768)` projection and CLIP ViT-L/14 multi-prompt text embeddings of the 80 COCO class names, with a learned scalar temperature and per-class bias. Regression uses exponentiated LTRB distances with a learned per-level scale. Centerness is a single 1×1 convolution. 2,975,067 parameters.
275
+
276
+ Trained at 768×768 with letterbox padding, ATSS target assignment (Zhang et al. 2020), horizontal-flip augmentation, focal loss (α=0.25, γ=2.0) for classification, GIoU for boxes, BCE for centerness, AdamW (lr 5e-4, weight decay 1e-4), cosine schedule with 3% warmup, batch 16, 16 epochs. Step 104,000 of 117,264 was selected by late-training checkpoint sweep as the base. A 3-epoch partial fine-tune at lr 1e-4 then updates only `cls_project`, `cls_bias`, and `logit_scale` (the classification calibration layers), leaving the towers and the decomposition path frozen. The partial fine-tune adds +0.15 aggregate mAP and +1.1 small-object mAP. The shipped weights are the final state of that fine-tune. The standalone detection-head checkpoint is mirrored in the sibling detection-heads repo at [`heads/cofiber_threshold/split_tower_5scale_160h_5std_4dw_ema_l14_16ep_768_cls_calib/checkpoint_final.pth`](https://huggingface.co/phanerozoic/detection-heads/blob/main/heads/cofiber_threshold/split_tower_5scale_160h_5std_4dw_ema_l14_16ep_768_cls_calib/checkpoint_final.pth) with its eval JSON alongside.
277
+
278
+ **Correspondence.** No learned parameters. At inference, dense patch features are extracted from both images, upsampled to 512×512 pixel resolution, and matched by cosine similarity per source keypoint.
279
+
280
+ ### Compute
281
+
282
+ | Task | Iterations | Notes |
283
+ |----------------------------------|-----------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------|
284
+ | Segmentation (ADE20K) | 40,000 | linear probe, batch 16, 512px, CE loss, frozen backbone |
285
+ | Linear classifier (ImageNet-1k) | 100 epochs × 313 steps | SGD momentum 0.9, batch 4096, cosine schedule on cached CLS features; extraction is a single full-train pass through the frozen backbone |
286
+ | DPT depth decoder (NYU Depth V2) | 38,400 iterations | batch 16, 416px, SILog loss, frozen backbone |
287
+ | Detection (COCO 2017) | 16 epochs × 7,329 batches at 768px + 3-epoch partial fine-tune of classification calibration layers | bf16 mixed-precision forward + fp32 master params + fp32 AdamW moments, CUDA graph capture, frozen backbone |
288
+ | Correspondence (SPair-71k) | training-free | |
289
+
290
+ ### Why minimal heads
291
+
292
+ The segmentation and classification heads follow the EUPE paper's evaluation principle: a minimal decoder isolates the backbone's contribution from the head's capacity. A Mask2Former-style segmentation head would produce higher mIoU, but those numbers would reflect the decoder as much as the features. The depth and detection heads are heavier because their tasks require multi-scale reasoning. The decomposition costs no trained parameters, so the detection head budget stays small (2.98M) while covering five pyramid levels from stride 8 to stride 128.
293
+
294
+ ## Notes and limitations
295
+
296
+ - The segmentation head was trained on ADE20K's 150-class indoor-and-urban label space.
297
+ - The depth head was trained on NYU Depth V2 (indoor). Outdoor metric depth should be treated as approximate.
298
+ - The detection head was trained on COCO 2017's 80-class label space at 768-pixel input. Small-object mAP (22.3) is the weakest axis because the stride-8 P3 level can only resolve objects roughly 10 pixels and larger at that resolution.
299
+ - Correspondence has no confidence signal; it returns a target pixel for every source keypoint regardless of match ambiguity.
300
+
301
+ ## License
302
+
303
+ The EUPE-ViT-B backbone weights inside this checkpoint were released by Meta FAIR under the [FAIR Research License](https://huggingface.co/facebook/EUPE-ViT-B/blob/main/LICENSE), which restricts use to non-commercial research and education. The task heads and class prototypes in this checkpoint were trained independently by the author of this repository and would on their own be releasable under a permissive license. However, because they are inseparably bundled with the backbone weights in a single file, the unified checkpoint inherits the more restrictive license of its most restricted component. In practical terms, both `model.safetensors` and `model.bf16_backbone.safetensors` should be treated as released under the FAIR Research License. See `LICENSE` for the full text.
304
+
305
+ ## Citation
306
+
307
+ ```bibtex
308
+ @misc{zhu2026eupe,
309
+ title={Efficient Universal Perception Encoder},
310
+ author={Zhu, Chenchen and Suri, Saksham and Jose, Cijo and Oquab, Maxime and Szafraniec, Marc and Wen, Wei and Xiong, Yunyang and Labatut, Patrick and Bojanowski, Piotr and Krishnamoorthi, Raghuraman and Chandra, Vikas},
311
+ year={2026},
312
+ eprint={2603.22387},
313
+ archivePrefix={arXiv},
314
+ primaryClass={cs.CV}
315
+ }
316
+ ```
317
+
318
+ ## Acknowledgements
319
+
320
+ The EUPE backbone was trained and released by Meta FAIR. The dataset loading utilities are from the DINOv3 repository. The Argus task heads, benchmarks, and packaging were done by [phanerozoic](https://huggingface.co/phanerozoic).
argus.py ADDED
@@ -0,0 +1,2257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Argus: multi-task perception on a single EUPE-ViT-B backbone.
3
+
4
+ from transformers import AutoModel
5
+ model = AutoModel.from_pretrained("phanerozoic/argus", trust_remote_code=True)
6
+ result = model.perceive(image)
7
+
8
+ The EUPE-ViT-B backbone architecture, all supporting layers, and the Argus
9
+ task heads are inlined below. The backbone code is reproduced from
10
+ facebookresearch/EUPE (Meta FAIR) under the FAIR Research License.
11
+ """
12
+
13
+ import math
14
+ import time
15
+ from functools import partial
16
+ from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import torch
20
+ import torch.nn.functional as F
21
+ import torch.nn.init
22
+ from PIL import Image
23
+ from torch import Tensor, nn
24
+ from torchvision.ops import nms
25
+ from torchvision.transforms import v2
26
+ from transformers import PretrainedConfig, PreTrainedModel
27
+
28
+
29
+ # ===========================================================================
30
+ # EUPE backbone — vendored verbatim from facebookresearch/EUPE
31
+ # ===========================================================================
32
+
33
+ # ---------- utility helpers (from eupe/utils/utils.py) ---------------------
34
+
35
def cat_keep_shapes(x_list: List[Tensor]) -> Tuple[Tensor, List[Tuple[int]], List[int]]:
    """Flatten every tensor to 2-D and concatenate the results along dim 0.

    Each input has all of its leading dimensions collapsed into one, keeping
    the last dimension intact. The original shapes and per-input row counts
    are returned so that ``uncat_with_shapes`` can undo the operation.

    Returns:
        A ``(flattened, shapes, num_tokens)`` triple: the concatenated 2-D
        tensor, the original shape of each input, and for each input the
        number of elements in everything but its last dimension.
    """
    shapes = []
    num_tokens = []
    rows = []
    for item in x_list:
        shapes.append(item.shape)
        # Selecting index 0 of the last dim leaves exactly the leading dims.
        num_tokens.append(item.select(dim=-1, index=0).numel())
        rows.append(item.flatten(0, -2))
    return torch.cat(rows), shapes, num_tokens
40
+
41
+
42
def uncat_with_shapes(flattened: Tensor, shapes: List[Tuple[int]], num_tokens: List[int]) -> List[Tensor]:
    """Split a concatenated 2-D tensor back into tensors with their leading shapes.

    ``flattened`` is cut along dim 0 into chunks of ``num_tokens`` rows. Each
    chunk is reshaped to the matching entry of ``shapes`` with the last
    dimension replaced by the current last dimension of ``flattened`` (so the
    feature width may differ from the one recorded in ``shapes``).
    """
    feature_dim = torch.Size([flattened.shape[-1]])
    chunks = torch.split_with_sizes(flattened, num_tokens, dim=0)
    restored = []
    for chunk, shape in zip(chunks, shapes):
        restored.append(chunk.reshape(shape[:-1] + feature_dim))
    return restored
47
+
48
+
49
def named_apply(
    fn: Callable,
    module: nn.Module,
    name: str = "",
    depth_first: bool = True,
    include_root: bool = False,
) -> nn.Module:
    """Recursively call ``fn(module=..., name=...)`` over a module tree.

    Args:
        fn: Callable invoked with keyword arguments ``module`` and ``name``.
        module: Root of the (sub)tree to walk.
        name: Dotted name of ``module``; child names are appended to it.
        depth_first: If true a module is visited after its children,
            otherwise before them.
        include_root: Whether ``fn`` is called on ``module`` itself. All
            descendants are always visited.

    Returns:
        ``module``, unchanged by this function itself.
    """
    visit_before_children = include_root and not depth_first
    visit_after_children = include_root and depth_first

    if visit_before_children:
        fn(module=module, name=name)

    for child_name, child_module in module.named_children():
        qualified_name = f"{name}.{child_name}" if name else child_name
        # Descendants are always visited, regardless of include_root here.
        named_apply(
            fn=fn,
            module=child_module,
            name=qualified_name,
            depth_first=depth_first,
            include_root=True,
        )

    if visit_after_children:
        fn(module=module, name=name)
    return module
70
+
71
+
72
+ # ---------- RMSNorm (from eupe/layers/rms_norm.py) -------------------------
73
+
74
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned scale.

    The input is divided by ``sqrt(mean(x**2) + eps)`` computed along the last
    dimension, then multiplied element-wise by ``weight``.
    """

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def reset_parameters(self) -> None:
        """Reset the scale to all ones."""
        nn.init.ones_(self.weight)

    def _norm(self, x: Tensor) -> Tensor:
        """Scale ``x`` by the reciprocal RMS of its last dimension."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x: Tensor) -> Tensor:
        # Normalise in float32, cast back to the input dtype, then apply the scale.
        normalised = self._norm(x.float())
        return normalised.type_as(x) * self.weight
89
+
90
+
91
+ # ---------- LayerScale (from eupe/layers/layer_scale.py) -------------------
92
+
93
class LayerScale(nn.Module):
    """Learned per-channel scaling of the last dimension.

    Args:
        dim: Size of the scaled (last) dimension.
        init_values: Initial value of the scale. Either a scalar or a tensor
            that broadcasts to shape ``(dim,)``.
        inplace: If true, ``forward`` multiplies its input in place.
        device: Device on which the scale parameter is allocated.
    """

    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
        device=None,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(torch.empty(dim, device=device))
        self.init_values = init_values
        # torch.empty leaves gamma holding arbitrary memory. Initialise it here
        # so a freshly constructed module is usable even when nobody calls
        # reset_parameters() or loads a checkpoint afterwards.
        self.reset_parameters()

    def reset_parameters(self):
        """Fill ``gamma`` with ``init_values``."""
        init = self.init_values
        if isinstance(init, Tensor) and init.dim() > 0:
            # nn.init.constant_ only handles scalars; copy (with broadcasting)
            # when a per-channel tensor was supplied.
            with torch.no_grad():
                self.gamma.copy_(init)
        else:
            nn.init.constant_(self.gamma, init)

    def forward(self, x: Tensor) -> Tensor:
        return x.mul_(self.gamma) if self.inplace else x * self.gamma
111
+
112
+
113
+ # ---------- PatchEmbed (from eupe/layers/patch_embed.py) -------------------
114
+
115
def make_2tuple(x):
    """Return ``x`` as a pair: a 2-tuple is passed through, an int is duplicated.

    Any other input (including tuples whose length is not 2) fails an assertion.
    """
    if not isinstance(x, tuple):
        assert isinstance(x, int)
        return (x, x)
    assert len(x) == 2
    return x
121
+
122
+
123
class PatchEmbed(nn.Module):
    """Embed an image batch as a sequence (or grid) of patch tokens.

    A ``Conv2d`` whose kernel size and stride both equal the patch size
    projects every non-overlapping patch to ``embed_dim`` channels, after
    which an optional normalisation layer is applied.

    Args:
        img_size: Nominal image size as an int or ``(H, W)``. Only used to
            fill the ``patches_resolution`` and ``num_patches`` attributes;
            ``forward`` derives the grid from the actual input.
        patch_size: Patch size as an int or ``(H, W)``.
        in_chans: Number of input channels.
        embed_dim: Number of output channels per patch token.
        norm_layer: Optional callable building a norm module from
            ``embed_dim``; ``nn.Identity`` is used when omitted.
        flatten_embedding: If true ``forward`` returns ``(B, N, D)``,
            otherwise ``(B, H', W', D)``.
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Optional[Callable] = None,
        flatten_embedding: bool = True,
    ) -> None:
        super().__init__()
        image_HW = make_2tuple(img_size)
        patch_HW = make_2tuple(patch_size)
        patch_grid_size = (image_HW[0] // patch_HW[0], image_HW[1] // patch_HW[1])

        self.img_size = image_HW
        self.patch_size = patch_HW
        self.patches_resolution = patch_grid_size
        self.num_patches = patch_grid_size[0] * patch_grid_size[1]
        self.in_chans = in_chans
        self.embed_dim = embed_dim
        self.flatten_embedding = flatten_embedding

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        """Project ``x`` of shape ``(B, C, H, W)`` to patch tokens."""
        # The unpack is kept only because it rejects non-4-D input up front;
        # the patch-grid size is read from the projected tensor below.
        _, _, _, _ = x.shape
        x = self.proj(x)
        H, W = x.size(2), x.size(3)
        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)
        if not self.flatten_embedding:
            x = x.reshape(-1, H, W, self.embed_dim)
        return x

    def reset_parameters(self):
        """Re-initialise the projection uniformly in ``[-1/sqrt(fan_in), 1/sqrt(fan_in)]``."""
        # fan_in is in_chans * kernel_h * kernel_w. Squaring patch_size[0]
        # would be wrong for the rectangular patches the constructor accepts.
        fan_in = self.in_chans * self.patch_size[0] * self.patch_size[1]
        bound = math.sqrt(1 / fan_in)
        nn.init.uniform_(self.proj.weight, -bound, bound)
        if self.proj.bias is not None:
            nn.init.uniform_(self.proj.bias, -bound, bound)
164
+
165
+
166
+ # ---------- RoPE (from eupe/layers/rope_position_encoding.py) --------------
167
+
168
class RopePositionEmbedding(nn.Module):
    """Generate 2-D rotary position embedding sin/cos tables for an H x W grid.

    Periods are derived either from ``base`` or from the pair
    ``min_period``/``max_period``; exactly one of the two specifications must
    be given. ``forward`` returns ``(sin, cos)``, each of shape
    ``(H * W, embed_dim // num_heads)``.

    In training mode the normalised coordinates can additionally be randomly
    shifted, jittered per axis, and rescaled via ``shift_coords``,
    ``jitter_coords`` and ``rescale_coords``.
    """

    def __init__(
        self,
        embed_dim: int,
        *,
        num_heads: int,
        base: Optional[float] = 100.0,
        min_period: Optional[float] = None,
        max_period: Optional[float] = None,
        normalize_coords: Literal["min", "max", "separate"] = "separate",
        shift_coords: Optional[float] = None,
        jitter_coords: Optional[float] = None,
        rescale_coords: Optional[float] = None,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
    ):
        super().__init__()
        assert embed_dim % (4 * num_heads) == 0
        has_base = base is not None
        has_period_range = min_period is not None and max_period is not None
        # Exactly one of the two period specifications has to be supplied.
        if has_base == has_period_range:
            raise ValueError("Either `base` or `min_period`+`max_period` must be provided.")

        head_dim = embed_dim // num_heads
        self.D_head = head_dim
        self.base = base
        self.min_period = min_period
        self.max_period = max_period
        self.normalize_coords = normalize_coords
        self.shift_coords = shift_coords
        self.jitter_coords = jitter_coords
        self.rescale_coords = rescale_coords
        self.dtype = dtype

        self.register_buffer(
            "periods",
            torch.empty(head_dim // 4, device=device, dtype=dtype),
            persistent=True,
        )
        self._init_weights()

    def forward(self, *, H: int, W: int) -> Tuple[Tensor, Tensor]:
        tensor_kwargs = {"device": self.periods.device, "dtype": self.dtype}

        # Choose the denominators that map patch centres into [0, 1].
        mode = self.normalize_coords
        if mode == "separate":
            denom_h, denom_w = H, W
        elif mode == "max":
            denom_h = denom_w = max(H, W)
        elif mode == "min":
            denom_h = denom_w = min(H, W)
        else:
            raise ValueError(f"Unknown normalize_coords: {self.normalize_coords}")
        coords_h = torch.arange(0.5, H, **tensor_kwargs) / denom_h
        coords_w = torch.arange(0.5, W, **tensor_kwargs) / denom_w

        grid = torch.meshgrid(coords_h, coords_w, indexing="ij")
        coords = torch.stack(grid, dim=-1).flatten(0, 1)  # [H*W, 2]
        coords = 2.0 * coords - 1.0  # [0, 1] -> [-1, 1]

        if self.training:
            # Random augmentations; the order below fixes the RNG draw order.
            if self.shift_coords is not None:
                shift_hw = torch.empty(2, **tensor_kwargs).uniform_(-self.shift_coords, self.shift_coords)
                coords = coords + shift_hw[None, :]
            if self.jitter_coords is not None:
                jitter_max = np.log(self.jitter_coords)
                jitter_hw = torch.empty(2, **tensor_kwargs).uniform_(-jitter_max, jitter_max).exp()
                coords = coords * jitter_hw[None, :]
            if self.rescale_coords is not None:
                rescale_max = np.log(self.rescale_coords)
                rescale_hw = torch.empty(1, **tensor_kwargs).uniform_(-rescale_max, rescale_max).exp()
                coords = coords * rescale_hw

        # [H*W, 2, D_head // 4] -> [H*W, D_head // 2] -> [H*W, D_head]
        angles = 2 * math.pi * coords[:, :, None] / self.periods[None, None, :]
        angles = angles.flatten(1, 2).tile(2)
        return (torch.sin(angles), torch.cos(angles))

    def _init_weights(self):
        """Fill the ``periods`` buffer from ``base`` or from the min/max period pair."""
        factory = {"device": self.periods.device, "dtype": self.dtype}
        quarter = self.D_head // 4
        if self.base is None:
            ratio = self.max_period / self.min_period
            exponents = torch.linspace(0, 1, quarter, **factory)
            periods = ratio ** exponents
            periods = periods / ratio
            periods = periods * self.max_period
        else:
            periods = self.base ** (2 * torch.arange(quarter, **factory) / (self.D_head // 2))
        self.periods.data = periods
267
+
268
+
269
+ # ---------- FFN layers (from eupe/layers/ffn_layers.py) --------------------
270
+
271
class ListForwardMixin(object):
    """Mixin that adds a ``forward_list`` built on top of a subclass's ``forward``."""

    def forward(self, x: Tensor):
        """Subclasses must provide the single-tensor forward pass."""
        raise NotImplementedError

    def forward_list(self, x_list: List[Tensor]) -> List[Tensor]:
        """Run ``forward`` once on the concatenation of ``x_list`` and split the result back."""
        flat, shapes, num_tokens = cat_keep_shapes(x_list)
        return uncat_with_shapes(self.forward(flat), shapes, num_tokens)
279
+
280
+
281
class Mlp(nn.Module, ListForwardMixin):
    """Two-layer feed-forward network: Linear -> activation -> dropout -> Linear -> dropout.

    Args:
        in_features: input width.
        hidden_features: hidden width; falls back to ``in_features`` when falsy.
        out_features: output width; falls back to ``in_features`` when falsy.
        act_layer: zero-argument factory for the activation module.
        drop: dropout probability (the same ``nn.Dropout`` is applied twice).
        bias: whether both linear layers carry a bias.
        device: forwarded to both ``nn.Linear`` constructors.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
        device=None,
    ) -> None:
        super().__init__()
        out_dim = out_features if out_features else in_features
        hidden_dim = hidden_features if hidden_features else in_features
        linear_kwargs = {"bias": bias, "device": device}
        # Construction order is kept (fc1, act, fc2, drop) so module ordering is unchanged.
        self.fc1 = nn.Linear(in_features, hidden_dim, **linear_kwargs)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_dim, out_dim, **linear_kwargs)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
307
+
308
+
309
class SwiGLUFFN(nn.Module, ListForwardMixin):
    """Gated feed-forward network computing ``w3(silu(w1(x)) * w2(x))``.

    The gated hidden width is ``int(hidden_features * 2 / 3)`` rounded up to
    the next multiple of ``align_to``. ``act_layer`` and ``drop`` are accepted
    for signature compatibility but are not used by this class.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Optional[Callable[..., nn.Module]] = None,
        drop: float = 0.0,
        bias: bool = True,
        align_to: int = 8,
        device=None,
    ) -> None:
        super().__init__()
        out_dim = out_features if out_features else in_features
        requested_hidden = hidden_features if hidden_features else in_features

        two_thirds = int(requested_hidden * 2 / 3)
        # (-n % k) is the distance from n up to the next multiple of k.
        gated_dim = two_thirds + (-two_thirds % align_to)

        linear_kwargs = {"bias": bias, "device": device}
        self.w1 = nn.Linear(in_features, gated_dim, **linear_kwargs)
        self.w2 = nn.Linear(in_features, gated_dim, **linear_kwargs)
        self.w3 = nn.Linear(gated_dim, out_dim, **linear_kwargs)

    def forward(self, x: Tensor) -> Tensor:
        gate = F.silu(self.w1(x))
        value = self.w2(x)
        return self.w3(gate * value)
335
+
336
+
337
+ # ---------- Attention (from eupe/layers/attention.py) ----------------------
338
+
339
def rope_rotate_half(x: Tensor) -> Tensor:
    """Split the last dimension into two chunks ``(a, b)`` and return ``(-b, a)``."""
    first, second = torch.chunk(x, 2, dim=-1)
    return torch.cat((second.neg(), first), dim=-1)
342
+
343
+
344
def rope_apply(x: Tensor, sin: Tensor, cos: Tensor) -> Tensor:
    """Apply a rotary embedding: ``x * cos + rope_rotate_half(x) * sin``."""
    unrotated_part = x * cos
    rotated_part = rope_rotate_half(x) * sin
    return unrotated_part + rotated_part
346
+
347
+
348
class LinearKMaskedBias(nn.Linear):
    """``nn.Linear`` whose bias is multiplied element-wise by a ``bias_mask`` buffer.

    ``out_features`` must be divisible by 3. When a bias exists, ``bias_mask``
    is registered filled with NaN, so the layer produces NaN until the mask is
    overwritten (by weight initialisation or by loading a state dict).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.out_features % 3 == 0
        if self.bias is None:
            return
        nan_mask = torch.full_like(self.bias, fill_value=math.nan)
        self.register_buffer("bias_mask", nan_mask)

    def forward(self, input: Tensor) -> Tensor:
        if self.bias is None:
            return F.linear(input, self.weight, None)
        mask = self.bias_mask.to(self.bias.dtype)
        return F.linear(input, self.weight, self.bias * mask)
359
+
360
+
361
class SelfAttention(nn.Module):
    """Multi-head self-attention with optional rotary position embedding.

    Args:
        dim: token embedding width.
        num_heads: number of attention heads.
        qkv_bias: whether the fused qkv projection has a bias.
        proj_bias: whether the output projection has a bias.
        attn_drop: probability for ``self.attn_drop`` (the module is created
            but not invoked by ``compute_attention``).
        proj_drop: dropout applied after the output projection in ``forward``.
        mask_k_bias: use ``LinearKMaskedBias`` for the qkv projection.
        device: forwarded to the linear layers.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        mask_k_bias: bool = False,
        device=None,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.scale = (dim // num_heads) ** -0.5

        qkv_cls = LinearKMaskedBias if mask_k_bias else nn.Linear
        self.qkv = qkv_cls(dim, dim * 3, bias=qkv_bias, device=device)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias, device=device)
        self.proj_drop = nn.Dropout(proj_drop)

    def apply_rope(self, q: Tensor, k: Tensor, rope) -> Tuple[Tensor, Tensor]:
        """Rotate the trailing tokens of ``q`` and ``k``; leading prefix tokens pass through.

        ``rope`` is a ``(sin, cos)`` pair. The number of un-rotated prefix
        tokens is ``q.shape[-2] - sin.shape[-2]`` and must be non-negative.
        Rotation runs in the dtype of ``sin``; results are cast back.
        """
        sin, cos = rope
        n_prefix = q.shape[-2] - sin.shape[-2]
        assert n_prefix >= 0

        def _rotate(t: Tensor) -> Tensor:
            original_dtype = t.dtype
            t = t.to(dtype=sin.dtype)
            head = t[:, :, :n_prefix, :]
            tail = rope_apply(t[:, :, n_prefix:, :], sin, cos)
            return torch.cat((head, tail), dim=-2).to(dtype=original_dtype)

        return _rotate(q), _rotate(k)

    def forward(self, x: Tensor, attn_bias=None, rope=None) -> Tensor:
        """Attention over a single batch tensor, followed by projection and dropout."""
        attended = self.compute_attention(qkv=self.qkv(x), attn_bias=attn_bias, rope=rope)
        return self.proj_drop(self.proj(attended))

    def forward_list(self, x_list, attn_bias=None, rope_list=None) -> List[Tensor]:
        """Attention over several tensors, sharing one qkv and one proj matmul.

        ``rope_list`` must have one entry per tensor in ``x_list``. Unlike
        ``forward``, this path does not apply ``proj_drop``.
        """
        assert len(x_list) == len(rope_list)
        flat_in, shapes, num_tokens = cat_keep_shapes(x_list)
        qkv_list = uncat_with_shapes(self.qkv(flat_in), shapes, num_tokens)

        attended = []
        for qkv, _shape, rope in zip(qkv_list, shapes, rope_list):
            attended.append(self.compute_attention(qkv, attn_bias=attn_bias, rope=rope))

        flat_out, shapes, num_tokens = cat_keep_shapes(attended)
        return uncat_with_shapes(self.proj(flat_out), shapes, num_tokens)

    def compute_attention(self, qkv: Tensor, attn_bias=None, rope=None) -> Tensor:
        """Split fused ``qkv`` into heads, apply RoPE if given, and run SDPA."""
        assert attn_bias is None
        B, N, _ = qkv.shape
        C = self.qkv.in_features

        per_head = qkv.reshape(B, N, 3, self.num_heads, C // self.num_heads)
        # Each of q, k, v becomes [B, heads, N, head_dim].
        q, k, v = (t.transpose(1, 2) for t in per_head.unbind(2))
        if rope is not None:
            q, k = self.apply_rope(q, k, rope)

        out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
        return out.transpose(1, 2).reshape([B, N, C])
435
+
436
+
437
+ # ---------- Block (from eupe/layers/block.py) ------------------------------
438
+
439
class SelfAttentionBlock(nn.Module):
    """Pre-norm transformer block: ``x + ls1(attn(norm1(x)))`` then ``+ ls2(mlp(norm2(.)))``.

    When ``drop_path > 0`` and the module is in training mode, each residual
    branch is computed on a random subset of the batch and added back with a
    compensating scale factor (see ``_forward_list``).

    Args:
        dim: token embedding width.
        num_heads: number of attention heads.
        ffn_ratio: FFN hidden width as a multiple of ``dim``.
        qkv_bias / proj_bias / ffn_bias: bias switches for the sub-layers.
        drop: dropout passed to the attention projection and the FFN.
        attn_drop: dropout passed to the attention class.
        init_values: LayerScale init value; a falsy value replaces LayerScale
            with ``nn.Identity``.
        drop_path: fraction of samples dropped per residual branch in training.
        act_layer / norm_layer / attn_class / ffn_layer: component factories.
        mask_k_bias: forwarded to the attention class.
        device: forwarded to the attention, FFN and LayerScale constructors
            (the norm layers are built without it).
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        ffn_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = SelfAttention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
        mask_k_bias: bool = False,
        device=None,
    ) -> None:
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
            mask_k_bias=mask_k_bias,
            device=device,
        )
        # A falsy init_values (None or 0) disables LayerScale entirely.
        self.ls1 = LayerScale(dim, init_values=init_values, device=device) if init_values else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * ffn_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
            device=device,
        )
        self.ls2 = LayerScale(dim, init_values=init_values, device=device) if init_values else nn.Identity()
        self.sample_drop_ratio = drop_path

    @staticmethod
    def _maybe_index_rope(rope, indices: Tensor):
        """Select the rows ``indices`` from a per-sample (4-D) ``(sin, cos)`` pair.

        Returns ``None`` for ``rope is None``; a pair that is not 4-D is
        returned unchanged.
        """
        if rope is None:
            return None
        sin, cos = rope
        assert sin.ndim == cos.ndim
        if sin.ndim == 4:
            return sin[indices], cos[indices]
        return sin, cos

    def _forward_list(self, x_list: List[Tensor], rope_list=None) -> List[Tensor]:
        """Run the block on every tensor in ``x_list``.

        NOTE(review): despite the ``None`` default, the non-dropping branch
        zips over ``rope_list`` and the dropping branch hands it to
        ``attn.forward_list``; callers should always pass a list (``forward``
        does).
        """
        b_list = [x.shape[0] for x in x_list]
        # Number of samples kept per tensor; never fewer than one.
        sample_subset_sizes = [max(int(b * (1 - self.sample_drop_ratio)), 1) for b in b_list]

        if self.training and self.sample_drop_ratio > 0.0:
            # Residuals from the kept samples are scaled by b / s when added back.
            residual_scale_factors = [b / s for b, s in zip(b_list, sample_subset_sizes)]
            # Random subset of the batch for the attention branch.
            indices_1_list = [
                torch.randperm(b, device=x.device)[:s]
                for x, b, s in zip(x_list, b_list, sample_subset_sizes)
            ]
            x_subset_1_list = [x[i] for x, i in zip(x_list, indices_1_list)]
            if rope_list is not None:
                rope_subset_list = [
                    self._maybe_index_rope(r, i) for r, i in zip(rope_list, indices_1_list)
                ]
            else:
                rope_subset_list = rope_list

            # norm1 is applied once on the concatenation of all subsets.
            flattened, shapes, num_tokens = cat_keep_shapes(x_subset_1_list)
            norm1 = uncat_with_shapes(self.norm1(flattened), shapes, num_tokens)
            residual_1_list = self.attn.forward_list(norm1, rope_list=rope_subset_list)

            # Add the scaled attention residual back onto the selected rows only.
            x_attn_list = [
                torch.index_add(x, dim=0, source=self.ls1(r1), index=i1, alpha=rsf)
                for x, r1, i1, rsf in zip(x_list, residual_1_list, indices_1_list, residual_scale_factors)
            ]

            # Independent random subset for the FFN branch.
            indices_2_list = [
                torch.randperm(b, device=x.device)[:s]
                for x, b, s in zip(x_list, b_list, sample_subset_sizes)
            ]
            x_subset_2_list = [x[i] for x, i in zip(x_attn_list, indices_2_list)]
            flattened, shapes, num_tokens = cat_keep_shapes(x_subset_2_list)
            norm2_list = uncat_with_shapes(self.norm2(flattened), shapes, num_tokens)
            residual_2_list = self.mlp.forward_list(norm2_list)

            x_ffn = [
                torch.index_add(xa, dim=0, source=self.ls2(r2), index=i2, alpha=rsf)
                for xa, r2, i2, rsf in zip(x_attn_list, residual_2_list, indices_2_list, residual_scale_factors)
            ]
        else:
            # Eval mode or no sample dropping: plain residual block per tensor.
            x_out = []
            for x, rope in zip(x_list, rope_list):
                x_attn = x + self.ls1(self.attn(self.norm1(x), rope=rope))
                x_ffn = x_attn + self.ls2(self.mlp(self.norm2(x_attn)))
                x_out.append(x_ffn)
            x_ffn = x_out
        return x_ffn

    def forward(self, x_or_x_list, rope_or_rope_list=None) -> List[Tensor]:
        """Accept either a single tensor or a list of tensors.

        A ``Tensor`` input returns a single ``Tensor`` (the annotation only
        describes the list case); a ``list`` input returns a list. Any other
        input type raises ``AssertionError``.
        """
        if isinstance(x_or_x_list, Tensor):
            return self._forward_list([x_or_x_list], rope_list=[rope_or_rope_list])[0]
        elif isinstance(x_or_x_list, list):
            if rope_or_rope_list is None:
                rope_or_rope_list = [None for _ in x_or_x_list]
            return self._forward_list(x_or_x_list, rope_list=rope_or_rope_list)
        raise AssertionError
552
+
553
+
554
+ # ---------- DinoVisionTransformer (from eupe/models/vision_transformer.py)
555
+
556
# Maps the `ffn_layer` config string to the feed-forward class used in each
# block. The "swigluN" variants round the gated hidden width up to a multiple
# of N.
ffn_layer_dict = {
    "mlp": Mlp,
    "swiglu": SwiGLUFFN,
    "swiglu32": partial(SwiGLUFFN, align_to=32),
    "swiglu64": partial(SwiGLUFFN, align_to=64),
    "swiglu128": partial(SwiGLUFFN, align_to=128),
}

# Maps the `norm_layer` config string to a normalisation-layer factory.
# "layernorm" and "layernormbf16" differ only in eps (1e-6 vs 1e-5).
norm_layer_dict = {
    "layernorm": partial(nn.LayerNorm, eps=1e-6),
    "layernormbf16": partial(nn.LayerNorm, eps=1e-5),
    "rmsnorm": RMSNorm,
}

# Maps short dtype names (as used by `pos_embed_rope_dtype`) to torch dtypes.
dtype_dict = {
    "fp32": torch.float32,
    "fp16": torch.float16,
    "bf16": torch.bfloat16,
}
575
+
576
+
577
def init_weights_vit(module: nn.Module, name: str = ""):
    """Per-module weight initialiser, intended to be applied to every submodule.

    * ``nn.Linear``: truncated-normal weights (std 0.02) and zero bias. If the
      layer has a ``bias_mask`` buffer, it is set to 1 everywhere except the
      middle third of the output features, which is set to 0.
    * ``nn.LayerNorm``, ``LayerScale``, ``PatchEmbed``, ``RMSNorm``: delegate to
      the module's own ``reset_parameters``.

    ``name`` is accepted for the ``named_apply`` calling convention and unused.
    """
    if isinstance(module, nn.Linear):
        torch.nn.init.trunc_normal_(module.weight, std=0.02)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
        bias_mask = getattr(module, "bias_mask", None) if hasattr(module, "bias_mask") else None
        if bias_mask is not None:
            n_out = module.out_features
            module.bias_mask.fill_(1)
            # The middle third of a fused qkv projection's outputs is masked out.
            module.bias_mask[n_out // 3 : 2 * n_out // 3].fill_(0)

    # One independent check per type, so a module matching several is reset once per match.
    for resettable_type in (nn.LayerNorm, LayerScale, PatchEmbed, RMSNorm):
        if isinstance(module, resettable_type):
            module.reset_parameters()
594
+
595
+
596
class DinoVisionTransformer(nn.Module):
    """Vision Transformer backbone with RoPE, a CLS token and optional storage tokens.

    The constructor builds, in order: the patch embedding, the CLS token,
    optional storage tokens, the rotary position embedding, ``depth``
    ``SelfAttentionBlock`` layers, the final norm, optional separate CLS norms,
    an identity head and a mask token. Token parameters are allocated with
    ``torch.empty``; call ``init_weights`` (or load a checkpoint) before use.

    All constructor arguments are keyword-only. Unknown keyword arguments are
    accepted and discarded.
    """

    def __init__(
        self,
        *,
        img_size: int = 224,
        patch_size: int = 16,
        in_chans: int = 3,
        pos_embed_rope_base: float = 100.0,
        pos_embed_rope_min_period: Optional[float] = None,
        pos_embed_rope_max_period: Optional[float] = None,
        pos_embed_rope_normalize_coords: Literal["min", "max", "separate"] = "separate",
        pos_embed_rope_shift_coords: Optional[float] = None,
        pos_embed_rope_jitter_coords: Optional[float] = None,
        pos_embed_rope_rescale_coords: Optional[float] = None,
        pos_embed_rope_dtype: str = "bf16",
        embed_dim: int = 768,
        depth: int = 12,
        num_heads: int = 12,
        ffn_ratio: float = 4.0,
        qkv_bias: bool = True,
        drop_path_rate: float = 0.0,
        layerscale_init: Optional[float] = None,
        norm_layer: str = "layernorm",
        ffn_layer: str = "mlp",
        ffn_bias: bool = True,
        proj_bias: bool = True,
        n_storage_tokens: int = 0,
        mask_k_bias: bool = False,
        untie_cls_and_patch_norms: bool = False,
        untie_global_and_local_cls_norm: bool = False,
        device: Any = None,
        **ignored_kwargs,
    ):
        super().__init__()
        # Extra config keys are tolerated and dropped.
        del ignored_kwargs

        # `norm_layer` / `ffn_layer` / `pos_embed_rope_dtype` are string keys
        # into the module-level lookup dicts; an unknown key raises KeyError.
        norm_layer_cls = norm_layer_dict[norm_layer]

        self.num_features = self.embed_dim = embed_dim
        self.n_blocks = depth
        self.num_heads = num_heads
        self.patch_size = patch_size

        # flatten_embedding=False: prepare_tokens_with_masks unpacks the output
        # as (B, H, W, C) and flattens it there.
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            flatten_embedding=False,
        )

        self.cls_token = nn.Parameter(torch.empty(1, 1, embed_dim, device=device))
        self.n_storage_tokens = n_storage_tokens
        if self.n_storage_tokens > 0:
            self.storage_tokens = nn.Parameter(torch.empty(1, n_storage_tokens, embed_dim, device=device))

        self.rope_embed = RopePositionEmbedding(
            embed_dim=embed_dim,
            num_heads=num_heads,
            base=pos_embed_rope_base,
            min_period=pos_embed_rope_min_period,
            max_period=pos_embed_rope_max_period,
            normalize_coords=pos_embed_rope_normalize_coords,
            shift_coords=pos_embed_rope_shift_coords,
            jitter_coords=pos_embed_rope_jitter_coords,
            rescale_coords=pos_embed_rope_rescale_coords,
            dtype=dtype_dict[pos_embed_rope_dtype],
            device=device,
        )

        ffn_layer_cls = ffn_layer_dict[ffn_layer]
        # Same FFN ratio for every block.
        ffn_ratio_sequence = [ffn_ratio] * depth
        blocks_list = [
            SelfAttentionBlock(
                dim=embed_dim,
                num_heads=num_heads,
                ffn_ratio=ffn_ratio_sequence[i],
                qkv_bias=qkv_bias,
                proj_bias=proj_bias,
                ffn_bias=ffn_bias,
                drop_path=drop_path_rate,
                norm_layer=norm_layer_cls,
                act_layer=nn.GELU,
                ffn_layer=ffn_layer_cls,
                init_values=layerscale_init,
                mask_k_bias=mask_k_bias,
                device=device,
            )
            for i in range(depth)
        ]

        self.chunked_blocks = False
        self.blocks = nn.ModuleList(blocks_list)
        self.norm = norm_layer_cls(embed_dim)

        # Optional dedicated norm for the CLS/storage tokens.
        self.untie_cls_and_patch_norms = untie_cls_and_patch_norms
        self.cls_norm = norm_layer_cls(embed_dim) if untie_cls_and_patch_norms else None

        # Optional dedicated CLS norm used for list entry 1 during training
        # (see forward_features_list).
        self.untie_global_and_local_cls_norm = untie_global_and_local_cls_norm
        self.local_cls_norm = norm_layer_cls(embed_dim) if untie_global_and_local_cls_norm else None

        self.head = nn.Identity()
        self.mask_token = nn.Parameter(torch.empty(1, embed_dim, device=device))

    def init_weights(self):
        """Initialise RoPE periods, the token parameters and every submodule."""
        self.rope_embed._init_weights()
        nn.init.normal_(self.cls_token, std=0.02)
        if self.n_storage_tokens > 0:
            nn.init.normal_(self.storage_tokens, std=0.02)
        nn.init.zeros_(self.mask_token)
        named_apply(init_weights_vit, self)

    def prepare_tokens_with_masks(self, x: Tensor, masks=None) -> Tuple[Tensor, Tuple[int, int]]:
        """Patch-embed ``x`` and prepend the CLS and storage tokens.

        Where ``masks`` is true, the patch embedding is replaced by
        ``mask_token``. Returns the token sequence
        ``[cls, storage..., patches...]`` and the patch grid size ``(H, W)``.
        """
        x = self.patch_embed(x)
        B, H, W, _ = x.shape
        x = x.flatten(1, 2)

        if masks is not None:
            x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
            cls_token = self.cls_token
        else:
            # Adds a zero-valued term so mask_token still takes part in the
            # computation when no mask is given — presumably so the parameter
            # is never "unused" for distributed training; confirm.
            cls_token = self.cls_token + 0 * self.mask_token

        if self.n_storage_tokens > 0:
            storage_tokens = self.storage_tokens
        else:
            # Zero-length placeholder so the concatenation below is uniform.
            storage_tokens = torch.empty(
                1, 0, cls_token.shape[-1],
                dtype=cls_token.dtype, device=cls_token.device,
            )

        x = torch.cat(
            [cls_token.expand(B, -1, -1), storage_tokens.expand(B, -1, -1), x],
            dim=1,
        )
        return x, (H, W)

    def forward_features_list(self, x_list: List[Tensor], masks_list: List[Tensor]) -> List[Dict[str, Tensor]]:
        """Run the backbone on a list of image batches and return one feature dict per entry.

        Each dict has the keys ``x_norm_clstoken``, ``x_storage_tokens``,
        ``x_norm_patchtokens``, ``x_prenorm`` and ``masks``.
        """
        x = []
        rope = []
        for t_x, t_masks in zip(x_list, masks_list):
            t2_x, hw_tuple = self.prepare_tokens_with_masks(t_x, t_masks)
            x.append(t2_x)
            rope.append(hw_tuple)
        for blk in self.blocks:
            # The sin/cos tables are rebuilt for every block, so in training
            # mode each block sees its own random coordinate augmentation.
            if self.rope_embed is not None:
                rope_sincos = [self.rope_embed(H=H, W=W) for H, W in rope]
            else:
                rope_sincos = [None for _ in rope]
            x = blk(x, rope_sincos)
        all_x = x
        output = []
        for idx, (x, masks) in enumerate(zip(all_x, masks_list)):
            if self.untie_cls_and_patch_norms or self.untie_global_and_local_cls_norm:
                # NOTE(review): idx == 1 is presumably the local-crop entry of
                # the training input list — confirm against the training code.
                if self.untie_global_and_local_cls_norm and self.training and idx == 1:
                    x_norm_cls_reg = self.local_cls_norm(x[:, : self.n_storage_tokens + 1])
                elif self.untie_cls_and_patch_norms:
                    x_norm_cls_reg = self.cls_norm(x[:, : self.n_storage_tokens + 1])
                else:
                    x_norm_cls_reg = self.norm(x[:, : self.n_storage_tokens + 1])
                x_norm_patch = self.norm(x[:, self.n_storage_tokens + 1 :])
            else:
                # Shared norm: normalise once, then split CLS+storage from patches.
                x_norm = self.norm(x)
                x_norm_cls_reg = x_norm[:, : self.n_storage_tokens + 1]
                x_norm_patch = x_norm[:, self.n_storage_tokens + 1 :]
            output.append({
                "x_norm_clstoken": x_norm_cls_reg[:, 0],
                "x_storage_tokens": x_norm_cls_reg[:, 1:],
                "x_norm_patchtokens": x_norm_patch,
                "x_prenorm": x,
                "masks": masks,
            })
        return output

    def forward_features(self, x, masks: Optional[Tensor] = None):
        """Dispatch to ``forward_features_list``.

        A single tensor returns a single dict. Any other input is treated as a
        list, in which case ``masks`` must be a matching list (it is zipped
        with ``x``).
        """
        if isinstance(x, torch.Tensor):
            return self.forward_features_list([x], [masks])[0]
        return self.forward_features_list(x, masks)

    def forward(self, *args, is_training: bool = False, **kwargs):
        """Return the full feature output if ``is_training``, else ``head(cls token)``.

        The non-training path indexes the result with ``"x_norm_clstoken"``
        and therefore expects a single-tensor input.
        """
        ret = self.forward_features(*args, **kwargs)
        if is_training:
            return ret
        return self.head(ret["x_norm_clstoken"])
780
+
781
+
782
def build_eupe_vitb16() -> DinoVisionTransformer:
    """Construct the EUPE ViT-B/16 backbone with its fixed hyper-parameters."""
    # qkv_bias=False, mask_k_bias=False: the upstream EUPE-ViT-B release shipped
    # with `qkv.bias_mask` filled with zeros, which makes the effective qkv bias
    # zero at every block (masked_bias = bias * 0 = 0). The bias parameter is
    # dropped entirely here — the computation is bitwise-equivalent in fp32, bf16
    # output drift is sub-ULP and absorbed by every head except DPT depth (where
    # it appears as ~2cm noise against a 39cm RMSE, i.e. below the head's own
    # floor).
    patch_config = dict(
        img_size=224,
        patch_size=16,
        in_chans=3,
    )
    rope_config = dict(
        pos_embed_rope_base=100,
        pos_embed_rope_normalize_coords="separate",
        pos_embed_rope_rescale_coords=2,
        pos_embed_rope_dtype="fp32",
    )
    encoder_config = dict(
        embed_dim=768,
        depth=12,
        num_heads=12,
        ffn_ratio=4,
        qkv_bias=False,
        drop_path_rate=0.0,
        layerscale_init=1.0e-05,
        norm_layer="layernormbf16",
        ffn_layer="mlp",
        ffn_bias=True,
        proj_bias=True,
        n_storage_tokens=4,
        mask_k_bias=False,
    )
    return DinoVisionTransformer(**patch_config, **rope_config, **encoder_config)
811
+
812
+
813
+ # ===========================================================================
814
+ # Argus task heads
815
+ # ===========================================================================
816
+
817
def make_eupe_transform(resize_size: int):
    """Build the image preprocessing pipeline for a square ``resize_size`` input.

    Steps: convert to image tensor, resize to ``(resize_size, resize_size)``
    with antialiasing, convert to float32 scaled to [0, 1], then normalise
    with the per-channel mean/std constants below.
    """
    target_hw = (resize_size, resize_size)
    steps = [
        v2.ToImage(),
        v2.Resize(target_hw, antialias=True),
        v2.ToDtype(torch.float32, scale=True),
        v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ]
    return v2.Compose(steps)
824
+
825
+
826
def _normalize_image_input(image_or_images) -> Tuple[bool, list]:
    """Returns (was_single, [images]). Accepts a PIL.Image or an iterable of them.

    Raises:
        ValueError: if the iterable is empty.
        TypeError: if any element is not a ``PIL.Image`` (the first offender is reported).
    """
    if isinstance(image_or_images, Image.Image):
        return True, [image_or_images]

    images = list(image_or_images)
    if len(images) == 0:
        raise ValueError("empty image list")

    first_bad = next(
        ((i, img) for i, img in enumerate(images) if not isinstance(img, Image.Image)),
        None,
    )
    if first_bad is not None:
        i, img = first_bad
        raise TypeError(f"images[{i}] is {type(img).__name__}, expected PIL.Image")
    return False, images
837
+
838
+
839
+ class _BackboneExportWrapper(nn.Module):
840
+ """ONNX-friendly wrapper: returns (cls, spatial) instead of a dict."""
841
+
842
+ def __init__(self, backbone: nn.Module):
843
+ super().__init__()
844
+ self.backbone = backbone
845
+
846
+ def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]:
847
+ out = self.backbone.forward_features(x)
848
+ cls = out["x_norm_clstoken"]
849
+ patches = out["x_norm_patchtokens"]
850
+ B, N, D = patches.shape
851
+ h = w = int(N ** 0.5)
852
+ spatial = patches.permute(0, 2, 1).reshape(B, D, h, w)
853
+ return cls, spatial
854
+
855
+
856
+ class _SegHeadExportWrapper(nn.Module):
857
+ """ONNX-friendly wrapper: seg head + bilinear upsample to input resolution.
858
+
859
+ The bare seg head emits stride-16 logits (e.g. [B, 150, 40, 40] at 640px
860
+ input). model.segment() upsamples those to the input resolution before
861
+ argmax. This wrapper folds the upsample into the graph so the ONNX seg
862
+ output is already at input resolution — consumers argmax directly without
863
+ a separate interpolation step.
864
+ """
865
+
866
+ def __init__(self, seg_head: nn.Module, resolution: int):
867
+ super().__init__()
868
+ self.seg_head = seg_head
869
+ self.resolution = resolution
870
+
871
+ def forward(self, spatial_features: Tensor) -> Tensor:
872
+ logits = self.seg_head(spatial_features)
873
+ return F.interpolate(logits, size=(self.resolution, self.resolution),
874
+ mode="bilinear", align_corners=False)
875
+
876
+
877
+ class _DepthHeadExportWrapper(nn.Module):
878
+ """ONNX-friendly wrapper for the DPT depth head.
879
+
880
+ DPTDepthDecoder.forward takes (intermediates: List[Tensor], H: int, W: int),
881
+ which torch.onnx.export cannot trace cleanly because the List contains four
882
+ tensors and H/W are Python ints. The wrapper accepts the four intermediate
883
+ ViT-block activations as separate positional tensor inputs and forwards them
884
+ to the underlying decoder with the captured H and W.
885
+ """
886
+
887
+ def __init__(self, depth_head: nn.Module, H: int, W: int):
888
+ super().__init__()
889
+ self.depth_head = depth_head
890
+ self.H = H
891
+ self.W = W
892
+
893
+ def forward(self, inter0: Tensor, inter1: Tensor, inter2: Tensor, inter3: Tensor) -> Tensor:
894
+ return self.depth_head([inter0, inter1, inter2, inter3], self.H, self.W)
895
+
896
+
897
+ class _ClassifierExportWrapper(nn.Module):
898
+ """ONNX-friendly wrapper for the ImageNet linear-softmax classifier.
899
+
900
+ Takes the backbone's CLS token, L2-normalizes, applies the stored
901
+ Linear(embed_dim, 1000) weight + bias, and returns a softmax
902
+ distribution over the 1000 ImageNet classes. The weight and bias are
903
+ captured as buffers so the graph is self-contained — no separate
904
+ weight file needed for classification inference.
905
+ """
906
+
907
+ def __init__(self, class_weight: Tensor, class_bias: Tensor):
908
+ super().__init__()
909
+ self.register_buffer("weight", class_weight.float().clone())
910
+ self.register_buffer("bias", class_bias.float().clone())
911
+
912
+ def forward(self, cls_token: Tensor) -> Tensor:
913
+ x = F.normalize(cls_token, dim=-1)
914
+ logits = F.linear(x, self.weight, self.bias)
915
+ return F.softmax(logits, dim=-1)
916
+
917
+
918
class _ONNXBatchedNMS(torch.autograd.Function):
    """Autograd wrapper that exports to ONNX NonMaxSuppression (opset >= 10).

    ONNX's NonMaxSuppression handles batched multi-class NMS natively:
        boxes  [B, N, 4] in [y1, x1, y2, x2] order (center_point_box=0)
        scores [B, C, N]
        -> selected_indices [M, 3] where each row is [batch, class, box]

    The eager forward path reproduces this via torchvision.ops.nms so
    PyTorch tracing and verify=True both work without calling into
    ORT for the reference.
    """

    @staticmethod
    def symbolic(g, boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold):
        nms_inputs = (boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold)
        return g.op("NonMaxSuppression", *nms_inputs, center_point_box_i=0)

    @staticmethod
    def forward(ctx, boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold):
        from torchvision.ops import nms as tv_nms

        n_batches, _, _ = boxes.shape
        _, n_classes, _ = scores.shape
        per_class_limit = int(max_output_boxes_per_class.item())
        iou_value = float(iou_threshold.item())
        score_floor = float(score_threshold.item())

        rows: List[List[int]] = []
        for b in range(n_batches):
            for c in range(n_classes):
                class_scores = scores[b, c]
                above_floor = class_scores > score_floor
                if not above_floor.any():
                    continue
                candidates = above_floor.nonzero(as_tuple=True)[0]
                # tv_nms expects [x1, y1, x2, y2]; our boxes are [y1, x1, y2, x2].
                xyxy = boxes[b, candidates][:, [1, 0, 3, 2]]
                survivors = tv_nms(xyxy, class_scores[candidates], iou_value)[:per_class_limit]
                # Translate positions within `candidates` back to box indices.
                for position in survivors.tolist():
                    rows.append([b, c, int(candidates[position].item())])

        if rows:
            return torch.tensor(rows, dtype=torch.long, device=boxes.device)
        return torch.zeros((0, 3), dtype=torch.long, device=boxes.device)
966
+
967
+
968
class _DetectionHeadExportWrapper(nn.Module):
    """ONNX-friendly wrapper for the detection head (simple FPN + FCOS).

    Takes backbone stride-16 spatial features and returns decoded
    per-location predictions concatenated across all five FPN levels.

    Without NMS (default):
      - boxes        [B, N_total, 4]  xyxy in input-resolution pixels,
                                       decoded as (location - reg[l, t]) /
                                       (location + reg[r, b]) and clamped to
                                       [0, resolution].
      - scores       [B, N_total, num_classes]
                                       sigmoid(cls_logits) * sigmoid(centerness).

    NOTE(review): this wrapper applies no exp() to the regression outputs; it
    presumably relies on the detection head returning already-positive
    distances in pixels — confirm against the head's forward().

    With NMS (include_nms=True):
      - boxes        [M, 4]  xyxy in input-resolution pixels
      - scores       [M]
      - class_labels [M]     int64 class index
      - batch_indices[M]     int64 batch index

    N_total = sum(H_i * W_i) across strides [8, 16, 32, 64, 128]. At
    640px input: 6400 + 1600 + 400 + 100 + 25 = 8525 locations/image.

    The NMS variant folds ONNX's NonMaxSuppression (opset >= 10) into
    the graph using the configured iou / score / max_detections
    parameters, producing a flat list of surviving detections across
    all batches and classes. Useful for single-shot TensorRT / mobile
    inference. Without NMS the consumer runs their own — hard vs soft,
    per-class vs global, threshold tuning — without re-exporting.

    Note that ``nms_max_detections`` is passed as the per-class, per-batch
    output limit of the NMS op, not as a global cap.
    """

    def __init__(self, detection_head: nn.Module, resolution: int,
                 include_nms: bool = False,
                 nms_iou_threshold: float = 0.5,
                 nms_score_threshold: float = 0.05,
                 nms_max_detections: int = 100):
        super().__init__()
        self.detection_head = detection_head
        self.resolution = resolution
        self.num_classes = detection_head.num_classes
        self.include_nms = include_nms
        self.nms_iou_threshold = nms_iou_threshold
        self.nms_score_threshold = nms_score_threshold
        self.nms_max_detections = nms_max_detections

        # Compute per-level spatial sizes from the SimpleFeaturePyramid's actual
        # output shapes, not from resolution // stride. The pyramid starts at
        # stride-16 backbone features (H = resolution // 16) and produces:
        #   P3 = 2*H       via ConvTranspose2d(stride=2)
        #   P4 = H         via 1x1 + 3x3 convs (no stride)
        #   P5 = (H+1)//2  via Conv2d(3x3, stride=2, padding=1)
        #   P6 = (P5+1)//2 via Conv2d on P5
        #   P7 = (P6+1)//2 via Conv2d on P6
        # When resolution is a multiple of 128, these match resolution // stride
        # exactly; at other resolutions the stride-2 convs round up via the
        # padding=1 kernel=3 formula, so P6/P7 are slightly larger than
        # nominal stride division suggests. Feature-pyramid-level locations
        # still use the nominal FPN_STRIDES for FCOS box decoding because
        # that's what eager `model.detect` does.
        H = resolution // 16
        p3 = 2 * H
        p4 = H
        p5 = (H + 1) // 2
        p6 = (p5 + 1) // 2
        p7 = (p6 + 1) // 2
        feat_sizes = [(p3, p3), (p4, p4), (p5, p5), (p6, p6), (p7, p7)]
        locs_per_level = []
        for (h, w), s in zip(feat_sizes, FPN_STRIDES):
            # Cell centres in input pixels: (index + 0.5) * stride.
            ys = (torch.arange(h, dtype=torch.float32) + 0.5) * s
            xs = (torch.arange(w, dtype=torch.float32) + 0.5) * s
            gy, gx = torch.meshgrid(ys, xs, indexing="ij")
            # Row-major (x, y) pairs, matching the permute/reshape in forward().
            locs_per_level.append(torch.stack([gx.flatten(), gy.flatten()], -1))
        all_locs = torch.cat(locs_per_level, 0)
        # Precomputed once; stored as a buffer so it follows the module's device.
        self.register_buffer("all_locs", all_locs)

    def forward(self, spatial_features: Tensor):
        """Decode the head outputs; returns 2 tensors without NMS and 4 with NMS."""
        cls_logits, box_regs, centernesses = self.detection_head(spatial_features)
        B = spatial_features.shape[0]

        # Flatten every level from [B, C, H, W] to [B, H*W, C] and concatenate levels.
        flat_cls = torch.cat(
            [c.permute(0, 2, 3, 1).reshape(B, -1, self.num_classes) for c in cls_logits], dim=1)
        flat_reg = torch.cat(
            [r.permute(0, 2, 3, 1).reshape(B, -1, 4) for r in box_regs], dim=1)
        flat_ctr = torch.cat(
            [c.permute(0, 2, 3, 1).reshape(B, -1, 1) for c in centernesses], dim=1)

        scores = torch.sigmoid(flat_cls) * torch.sigmoid(flat_ctr)

        # Regression channels are used as (left, top, right, bottom) offsets
        # from each location.
        locs = self.all_locs.unsqueeze(0).expand(B, -1, -1)
        x1 = (locs[..., 0:1] - flat_reg[..., 0:1]).clamp(0, self.resolution)
        y1 = (locs[..., 1:2] - flat_reg[..., 1:2]).clamp(0, self.resolution)
        x2 = (locs[..., 0:1] + flat_reg[..., 2:3]).clamp(0, self.resolution)
        y2 = (locs[..., 1:2] + flat_reg[..., 3:4]).clamp(0, self.resolution)
        boxes = torch.cat([x1, y1, x2, y2], dim=-1)

        if not self.include_nms:
            return boxes, scores

        # ONNX NMS expects boxes in [y1, x1, y2, x2] (center_point_box=0) and
        # scores with the class dim in the middle: [B, C, N].
        boxes_yxyx = torch.cat([y1, x1, y2, x2], dim=-1)
        scores_bcn = scores.permute(0, 2, 1).contiguous()

        # The NMS op takes its thresholds as tensors.
        max_out = torch.tensor(self.nms_max_detections, dtype=torch.long, device=boxes.device)
        iou_thr = torch.tensor(self.nms_iou_threshold, dtype=torch.float32, device=boxes.device)
        score_thr = torch.tensor(self.nms_score_threshold, dtype=torch.float32, device=boxes.device)

        selected = _ONNXBatchedNMS.apply(
            boxes_yxyx, scores_bcn, max_out, iou_thr, score_thr,
        )
        # Each row of `selected` is [batch, class, box].
        batch_idx = selected[:, 0].long()
        class_idx = selected[:, 1].long()
        box_idx = selected[:, 2].long()

        sel_boxes = boxes[batch_idx, box_idx]  # [M, 4] xyxy
        sel_scores = scores[batch_idx, box_idx, class_idx]  # [M]
        return sel_boxes, sel_scores, class_idx, batch_idx
1084
+
1085
+
1086
class SegmentationHead(nn.Module):
    """Per-location classifier: BatchNorm2d followed by a 1x1 convolution.

    Maps ``[B, in_dim, H, W]`` features to ``[B, num_classes, H, W]`` logits.
    """

    def __init__(self, in_dim: int = 768, num_classes: int = 150):
        super().__init__()
        self.batchnorm_layer = nn.BatchNorm2d(in_dim)
        self.conv = nn.Conv2d(in_dim, num_classes, kernel_size=1)

    def forward(self, x: Tensor) -> Tensor:
        normalized = self.batchnorm_layer(x)
        return self.conv(normalized)
1094
+
1095
+
1096
class DepthHead(nn.Module):
    """Binned depth head: BatchNorm2d + 1x1 conv over ``n_bins`` depth bins.

    The per-pixel bin activations are turned into a normalized weighting
    (``relu(logits) + 0.1``, divided by its channel sum) and the output is
    the weighted mean of ``n_bins`` depths evenly spaced between
    ``min_depth`` and ``max_depth``. Output shape is ``[B, 1, H, W]``.
    """

    def __init__(self, in_dim: int = 768, n_bins: int = 256,
                 min_depth: float = 0.001, max_depth: float = 10.0):
        super().__init__()
        self.batchnorm_layer = nn.BatchNorm2d(in_dim)
        self.conv_depth = nn.Conv2d(in_dim, n_bins, kernel_size=1)
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.n_bins = n_bins

    def forward(self, x: Tensor) -> Tensor:
        bin_logits = self.conv_depth(self.batchnorm_layer(x))
        # The +0.1 floor keeps every bin strictly positive, so the sum below is never zero.
        weights = torch.relu(bin_logits) + 0.1
        weights = weights / weights.sum(dim=1, keepdim=True)
        bin_depths = torch.linspace(self.min_depth, self.max_depth, self.n_bins,
                                    device=x.device)
        expected = torch.einsum("bkhw,k->bhw", weights, bin_depths)
        return expected.unsqueeze(1)
1112
+
1113
+
1114
+ # ===========================================================================
1115
+ # Detection (FCOS with ViTDet-style simple feature pyramid)
1116
+ # ===========================================================================
1117
+
1118
# Stride paired with each detection level when building location grids.
FPN_STRIDES = [8, 16, 32, 64, 128]

# Detection class names; the list index is the integer label.
COCO_CLASSES = [
    "person", "bicycle", "car", "motorcycle", "airplane",
    "bus", "train", "truck", "boat", "traffic light",
    "fire hydrant", "stop sign", "parking meter", "bench", "bird",
    "cat", "dog", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat",
    "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
    "wine glass", "cup", "fork", "knife", "spoon",
    "bowl", "banana", "apple", "sandwich", "orange",
    "broccoli", "carrot", "hot dog", "pizza", "donut",
    "cake", "chair", "couch", "potted plant", "bed",
    "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven",
    "toaster", "sink", "refrigerator", "book", "clock",
    "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]
1134
+
1135
+
1136
def cofiber_decompose(f: Tensor, n_scales: int) -> List[Tensor]:
    """Split ``f`` into ``n_scales`` bands with no learned parameters.

    At each step the current map is 2x average-pooled, the pooled map is
    bilinearly upsampled back to the current size, and the difference is
    stored as that scale's band. The pooled map becomes the input of the
    next step. The last entry of the returned list is the final pooled
    remainder (the lowest-frequency component).
    """
    bands: List[Tensor] = []
    current = f
    step = 0
    while step < n_scales - 1:
        pooled = F.avg_pool2d(current, 2)
        smooth = F.interpolate(pooled, size=current.shape[2:],
                               mode="bilinear", align_corners=False)
        bands.append(current - smooth)
        current = pooled
        step += 1
    bands.append(current)
    return bands
1151
+
1152
+
1153
def make_sin_pos_emb(H: int, W: int, dim: int, device) -> Tensor:
    """Build a 2D sinusoidal positional encoding of shape ``[1, dim, H, W]``.

    The first ``dim // 2`` channels encode the row index and the remaining
    ``dim // 2`` encode the column index. Within each half, channels
    alternate sin/cos over ``dim // 4`` frequencies
    ``exp(-k * ln(10000) / (dim // 4))``.
    """
    assert dim % 4 == 0, "pos emb dim must be divisible by 4"
    d = dim // 4
    freqs = torch.exp(torch.arange(d, device=device, dtype=torch.float32)
                      * -(math.log(10000.0) / d))

    def _axis_table(length: int) -> Tensor:
        # [length, 2 * d] with sin on even channels and cos on odd channels.
        coords = torch.arange(length, device=device, dtype=torch.float32)
        angles = coords[:, None] * freqs[None, :]
        table = torch.zeros(length, d * 2, device=device)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        return table

    row_table = _axis_table(H)
    col_table = _axis_table(W)

    pos = torch.zeros(dim, H, W, device=device)
    # Row half: constant along the width axis.
    pos[:d * 2] = row_table.t()[:, :, None].expand(-1, H, W)
    # Column half: constant along the height axis.
    pos[d * 2:] = col_table.t()[:, None, :].expand(-1, H, W)
    return pos.unsqueeze(0)
1172
+
1173
+
1174
class ConvGNBlock(nn.Module):
    """3x3 convolution -> GroupNorm -> GELU, keeping the channel count.

    GroupNorm uses ``min(32, channels)`` groups.
    """

    def __init__(self, channels: int):
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, 3, padding=1)
        self.norm = nn.GroupNorm(min(32, channels), channels)
        self.act = nn.GELU()

    def forward(self, x: Tensor) -> Tensor:
        y = self.conv(x)
        y = self.norm(y)
        return self.act(y)
1183
+
1184
+
1185
class DWResBlock(nn.Module):
    """Residual block: 1x1 conv -> GELU -> depthwise 3x3 conv -> GroupNorm.

    The normalized branch is added to the input, so the tensor shape is
    unchanged. GroupNorm uses ``min(32, channels)`` groups.
    """

    def __init__(self, channels: int):
        super().__init__()
        self.pw = nn.Conv2d(channels, channels, 1)
        self.act = nn.GELU()
        # groups=channels makes this a depthwise convolution.
        self.dw = nn.Conv2d(channels, channels, 3, padding=1, groups=channels)
        self.norm = nn.GroupNorm(min(32, channels), channels)

    def forward(self, x: Tensor) -> Tensor:
        branch = self.pw(x)
        branch = self.act(branch)
        branch = self.dw(branch)
        branch = self.norm(branch)
        return x + branch
1195
+
1196
+
1197
def make_tower(hidden: int, n_std: int, n_dw: int) -> nn.Sequential:
    """Stack ``n_std`` ConvGNBlock layers followed by ``n_dw`` DWResBlock
    layers, all ``hidden`` channels wide, into one ``nn.Sequential``."""
    blocks: List[nn.Module] = []
    for _ in range(n_std):
        blocks.append(ConvGNBlock(hidden))
    for _ in range(n_dw):
        blocks.append(DWResBlock(hidden))
    return nn.Sequential(*blocks)
1201
+
1202
+
1203
class SplitTowerHead(nn.Module):
    """Detection head operating on a cofiber decomposition of the frozen
    backbone features. Five prediction levels (strides 8, 16, 32, 64, 128):
    a stride-8 level synthesized by a transposed convolution from the
    stride-16 band and four cofiber bands at strides 16, 32, 64, 128.
    Separate classification and regression towers of depth (n_std_layers +
    n_dw_layers) with weights shared across levels. Classification via
    cosine similarity against frozen CLIP text-encoder embeddings of the
    COCO class names; regression via exponentiated LTRB distances with a
    learned per-level scale; centerness via a single 1x1 convolution.

    Inference-only within Argus: no DFL, no IoU-aware branch, no
    per-scale bias. The text_embed buffer is populated by from_pretrained's
    state_dict load."""

    def __init__(self,
                 feat_dim: int = 768,
                 hidden: int = 160,
                 n_std_layers: int = 5,
                 n_dw_layers: int = 4,
                 n_scales: int = 4,
                 pos_emb_dim: int = 64,
                 num_classes: int = 80,
                 text_embed_dim: int = 768):
        super().__init__()
        self.n_scales = n_scales
        self.pos_emb_dim = pos_emb_dim
        self.num_classes = num_classes
        self.text_embed_dim = text_embed_dim

        # One extra prediction level (the upsampled one) on top of the bands.
        n_levels = n_scales + 1
        # The positional encoding is concatenated on the channel axis.
        in_channels = feat_dim + pos_emb_dim

        # Per-band input normalization, then a stem shared by all bands.
        self.scale_norms = nn.ModuleList(
            [nn.GroupNorm(1, in_channels) for _ in range(n_scales)])
        self.stem = nn.Conv2d(in_channels, hidden, 1)
        self.stem_act = nn.GELU()

        # 2x transposed conv producing the finest level from the first band.
        self.p3_upsample = nn.ConvTranspose2d(hidden, hidden, 2, stride=2)
        self.p3_norm = nn.GroupNorm(min(32, hidden), hidden)

        # Top-down fusion: one lateral conv + norm per adjacent band pair.
        self.lateral_convs = nn.ModuleList(
            [nn.Conv2d(hidden, hidden, 1) for _ in range(n_scales - 1)])
        self.lateral_norms = nn.ModuleList(
            [nn.GroupNorm(min(32, hidden), hidden) for _ in range(n_scales - 1)])

        self.cls_tower = make_tower(hidden, n_std_layers, n_dw_layers)
        self.reg_tower = make_tower(hidden, n_std_layers, n_dw_layers)

        # CLIP text-aligned classifier. The text_embed buffer is filled from
        # the state dict at from_pretrained; the zero placeholder here only
        # exists so the module can be constructed before weights arrive.
        self.register_buffer("text_embed",
                             torch.zeros(num_classes, text_embed_dim))
        self.cls_project = nn.Linear(hidden, text_embed_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(math.log(1.0 / 0.07)))
        self.cls_bias = nn.Parameter(torch.full((num_classes,), -math.log(99)))

        self.reg_pred = nn.Conv2d(hidden, 4, 1)
        self.ctr_pred = nn.Conv2d(hidden, 1, 1)
        self.scale_params = nn.Parameter(torch.ones(n_levels))

    def forward(self, spatial: Tensor) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]:
        """Return per-level ``(cls_logits, box_distances, centerness)`` lists.

        Each list has one ``[B, C, H_i, W_i]`` tensor per level, finest first,
        with C equal to ``num_classes``, 4 and 1 respectively.
        """
        batch, _, grid_h, grid_w = spatial.shape
        pos = make_sin_pos_emb(grid_h, grid_w, self.pos_emb_dim, spatial.device)
        spatial = torch.cat([spatial, pos.expand(batch, -1, -1, -1)], dim=1)

        bands = cofiber_decompose(spatial, self.n_scales)
        feats: List[Tensor] = [
            self.stem_act(self.stem(self.scale_norms[level](band)))
            for level, band in enumerate(bands)
        ]

        # Top-down lateral fusion from coarser to finer scales.
        for fine in reversed(range(len(feats) - 1)):
            upsampled = F.interpolate(feats[fine + 1],
                                      size=feats[fine].shape[2:],
                                      mode="bilinear", align_corners=False)
            fused = feats[fine] + self.lateral_convs[fine](upsampled)
            feats[fine] = self.lateral_norms[fine](fused)

        finest = self.p3_norm(self.p3_upsample(feats[0]))
        levels = [finest] + feats

        cls_out: List[Tensor] = []
        reg_out: List[Tensor] = []
        ctr_out: List[Tensor] = []
        temperature = self.logit_scale.exp
        for level, feat in enumerate(levels):
            cls_feat = self.cls_tower(feat)
            reg_feat = self.reg_tower(feat)

            # Classification: project every location, L2-normalize, and take
            # the dot product with each row of text_embed.
            n, channels, hi, wi = cls_feat.shape
            tokens = cls_feat.permute(0, 2, 3, 1).reshape(-1, channels)
            unit = F.normalize(self.cls_project(tokens), p=2, dim=-1)
            similarity = unit @ self.text_embed.t()
            cls = (similarity * temperature() + self.cls_bias).reshape(
                n, hi, wi, self.num_classes).permute(0, 3, 1, 2)

            # Regression: clamp the scaled raw output before exp() so the
            # distances stay within exp(-10)..exp(10).
            raw = (self.reg_pred(reg_feat) * self.scale_params[level]).clamp(-10, 10)
            distances = raw.exp()
            centerness = self.ctr_pred(reg_feat)

            cls_out.append(cls)
            reg_out.append(distances)
            ctr_out.append(centerness)

        return cls_out, reg_out, ctr_out
1303
+
1304
+
1305
+ def _make_locations(feature_sizes: List[Tuple[int, int]], strides: List[int], device) -> List[Tensor]:
1306
+ """Per-level center coordinates of feature-map locations in image space."""
1307
+ all_locs = []
1308
+ for (h, w), s in zip(feature_sizes, strides):
1309
+ ys = (torch.arange(h, device=device, dtype=torch.float32) + 0.5) * s
1310
+ xs = (torch.arange(w, device=device, dtype=torch.float32) + 0.5) * s
1311
+ grid_y, grid_x = torch.meshgrid(ys, xs, indexing="ij")
1312
+ locs = torch.stack([grid_x.flatten(), grid_y.flatten()], dim=-1)
1313
+ all_locs.append(locs)
1314
+ return all_locs
1315
+
1316
+
1317
+ @torch.inference_mode()
1318
+ def _decode_detections(
1319
+ cls_logits_per_level: List[Tensor],
1320
+ box_regs_per_level: List[Tensor],
1321
+ centernesses_per_level: List[Tensor],
1322
+ locations_per_level: List[Tensor],
1323
+ image_sizes: List[Tuple[int, int]],
1324
+ score_thresh: float = 0.05,
1325
+ nms_thresh: float = 0.5,
1326
+ max_per_level: int = 1000,
1327
+ max_per_image: int = 100,
1328
+ ) -> List[Dict[str, Tensor]]:
1329
+ """Convert per-level logits/regs/centerness into per-image detections (xyxy boxes)."""
1330
+ B = cls_logits_per_level[0].shape[0]
1331
+ num_classes = cls_logits_per_level[0].shape[1]
1332
+ device = cls_logits_per_level[0].device
1333
+
1334
+ per_image_results = []
1335
+ for image_idx in range(B):
1336
+ all_boxes, all_scores, all_labels = [], [], []
1337
+ for cls_l, reg_l, ctr_l, locs_l in zip(
1338
+ cls_logits_per_level, box_regs_per_level, centernesses_per_level, locations_per_level
1339
+ ):
1340
+ cls = cls_l[image_idx].permute(1, 2, 0).reshape(-1, num_classes)
1341
+ reg = reg_l[image_idx].permute(1, 2, 0).reshape(-1, 4)
1342
+ ctr = ctr_l[image_idx].permute(1, 2, 0).reshape(-1)
1343
+
1344
+ cls_prob = torch.sigmoid(cls)
1345
+ ctr_prob = torch.sigmoid(ctr)
1346
+ scores = cls_prob * ctr_prob[:, None]
1347
+
1348
+ mask = scores > score_thresh
1349
+ if not mask.any():
1350
+ continue
1351
+ cand_loc, cand_cls = mask.nonzero(as_tuple=True)
1352
+ cand_scores = scores[cand_loc, cand_cls]
1353
+
1354
+ if cand_scores.numel() > max_per_level:
1355
+ top = cand_scores.topk(max_per_level)
1356
+ cand_scores = top.values
1357
+ idx = top.indices
1358
+ cand_loc = cand_loc[idx]
1359
+ cand_cls = cand_cls[idx]
1360
+
1361
+ cand_locs_xy = locs_l[cand_loc]
1362
+ cand_reg = reg[cand_loc]
1363
+ boxes = torch.stack([
1364
+ cand_locs_xy[:, 0] - cand_reg[:, 0],
1365
+ cand_locs_xy[:, 1] - cand_reg[:, 1],
1366
+ cand_locs_xy[:, 0] + cand_reg[:, 2],
1367
+ cand_locs_xy[:, 1] + cand_reg[:, 3],
1368
+ ], dim=-1)
1369
+ all_boxes.append(boxes)
1370
+ all_scores.append(cand_scores)
1371
+ all_labels.append(cand_cls)
1372
+
1373
+ if all_boxes:
1374
+ boxes = torch.cat(all_boxes, dim=0)
1375
+ scores = torch.cat(all_scores, dim=0)
1376
+ labels = torch.cat(all_labels, dim=0)
1377
+
1378
+ H, W = image_sizes[image_idx]
1379
+ boxes[:, 0::2] = boxes[:, 0::2].clamp(0, W)
1380
+ boxes[:, 1::2] = boxes[:, 1::2].clamp(0, H)
1381
+
1382
+ keep_all = []
1383
+ for c in labels.unique():
1384
+ cm = labels == c
1385
+ keep = nms(boxes[cm], scores[cm], nms_thresh)
1386
+ keep_idx = cm.nonzero(as_tuple=True)[0][keep]
1387
+ keep_all.append(keep_idx)
1388
+ keep_all = torch.cat(keep_all, dim=0)
1389
+
1390
+ boxes = boxes[keep_all]
1391
+ scores = scores[keep_all]
1392
+ labels = labels[keep_all]
1393
+
1394
+ if scores.numel() > max_per_image:
1395
+ top = scores.topk(max_per_image)
1396
+ boxes = boxes[top.indices]
1397
+ scores = top.values
1398
+ labels = labels[top.indices]
1399
+ else:
1400
+ boxes = torch.zeros((0, 4), device=device)
1401
+ scores = torch.zeros((0,), device=device)
1402
+ labels = torch.zeros((0,), dtype=torch.long, device=device)
1403
+
1404
+ per_image_results.append({"boxes": boxes, "scores": scores, "labels": labels})
1405
+
1406
+ return per_image_results
1407
+
1408
+
1409
def _letterbox_to_square(image: Image.Image, resolution: int) -> Tuple[Image.Image, float, Tuple[int, int]]:
    """Resize preserving aspect ratio and pad bottom/right with black. Matches the training transform.

    The long side is scaled to ``resolution`` and the result is pasted at the
    top-left corner of a black ``resolution x resolution`` RGB canvas.

    Returns:
        ``(canvas, scale, (orig_width, orig_height))`` where ``scale`` is the
        factor applied to the original pixel coordinates.
    """
    W0, H0 = image.size
    scale = resolution / max(H0, W0)
    # Clamp to at least 1 px: for extreme aspect ratios the short side would
    # otherwise round to 0 and the resize call would fail. The upper clamp
    # guards against rounding pushing the long side past the canvas.
    new_w = min(resolution, max(1, int(round(W0 * scale))))
    new_h = min(resolution, max(1, int(round(H0 * scale))))
    resized = image.resize((new_w, new_h), Image.BILINEAR)
    canvas = Image.new("RGB", (resolution, resolution), (0, 0, 0))
    canvas.paste(resized, (0, 0))
    return canvas, scale, (W0, H0)
1419
+
1420
+
1421
+ # ===========================================================================
1422
+ # DPT depth decoder (multi-scale, hooks into ViT blocks [2, 5, 8, 11])
1423
+ # ===========================================================================
1424
+
1425
# Indices of the backbone blocks whose outputs are tapped for the depth decoder.
HOOK_BLOCK_INDICES = list(range(2, 12, 3))  # [2, 5, 8, 11]
# Leading non-patch tokens stripped from each tapped sequence.
N_PREFIX_TOKENS = 1 + 4  # 1 CLS + 4 register/storage tokens
1427
+
1428
+
1429
+ class _ResidualConvUnit(nn.Module):
1430
+ def __init__(self, dim: int):
1431
+ super().__init__()
1432
+ self.conv1 = nn.Conv2d(dim, dim, 3, padding=1, bias=False)
1433
+ self.bn1 = nn.BatchNorm2d(dim)
1434
+ self.conv2 = nn.Conv2d(dim, dim, 3, padding=1, bias=False)
1435
+ self.bn2 = nn.BatchNorm2d(dim)
1436
+ self.act = nn.GELU()
1437
+
1438
+ def forward(self, x: Tensor) -> Tensor:
1439
+ return x + self.bn2(self.conv2(self.act(self.bn1(self.conv1(x)))))
1440
+
1441
+
1442
class _FeatureFusionBlock(nn.Module):
    """Fuse an optional skip input, refine with two residual units, upsample 2x.

    When ``has_skip`` is false no skip projection is created, and any ``skip``
    passed to ``forward`` is ignored.
    """

    def __init__(self, dim: int, has_skip: bool = True):
        super().__init__()
        self.rcu1 = _ResidualConvUnit(dim)
        self.rcu2 = _ResidualConvUnit(dim)
        self.skip_proj = nn.Conv2d(dim, dim, 1) if has_skip else None

    def forward(self, x: Tensor, skip: Optional[Tensor] = None) -> Tensor:
        use_skip = skip is not None and self.skip_proj is not None
        if use_skip:
            x = x + self.skip_proj(skip)
        refined = self.rcu2(self.rcu1(x))
        return F.interpolate(refined, scale_factor=2, mode="bilinear", align_corners=False)
1455
+
1456
+
1457
class _DPTReassemble(nn.Module):
    """Convert four token sequences into a four-level feature pyramid.

    Each sequence drops its first ``N_PREFIX_TOKENS`` tokens, is projected
    with LayerNorm + Linear to ``out_dim``, reshaped to an ``H x W`` map and
    refined by conv3x3 + BN + GELU. The four maps are then resampled by
    factors 4, 2, 1 and 0.5 respectively.
    """

    def __init__(self, in_dim: int = 768, out_dim: int = 256):
        super().__init__()
        self.projects = nn.ModuleList([
            nn.Sequential(nn.LayerNorm(in_dim), nn.Linear(in_dim, out_dim))
            for _ in range(4)
        ])
        self.refine = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(out_dim, out_dim, 3, padding=1, bias=False),
                nn.BatchNorm2d(out_dim),
                nn.GELU(),
            )
            for _ in range(4)
        ])

    def forward(self, intermediates: List[Tensor], H: int, W: int) -> List[Tensor]:
        maps = []
        for tokens, project, refine in zip(intermediates, self.projects, self.refine):
            patch_tokens = project(tokens[:, N_PREFIX_TOKENS:, :])
            batch, _, width = patch_tokens.shape
            # [B, N, D] -> [B, D, H, W]; requires N == H * W.
            grid = patch_tokens.permute(0, 2, 1).reshape(batch, width, H, W)
            maps.append(refine(grid))

        def _resample(t: Tensor, factor) -> Tensor:
            return F.interpolate(t, scale_factor=factor, mode="bilinear", align_corners=False)

        level_4 = _resample(maps[0], 4)
        level_8 = _resample(maps[1], 2)
        level_16 = maps[2]
        level_32 = _resample(maps[3], 0.5)
        return [level_4, level_8, level_16, level_32]
1487
+
1488
+
1489
class DPTDepthDecoder(nn.Module):
    """Multi-scale depth decoder: reassemble, fuse coarse-to-fine, predict bins.

    The final map is a per-pixel weighting over ``n_bins`` depths evenly
    spaced in ``[min_depth, max_depth]``. The predicted depth is the weighted
    mean of those bin depths.
    """

    def __init__(self, in_dim: int = 768, decoder_dim: int = 256,
                 n_bins: int = 256, min_depth: float = 0.001, max_depth: float = 10.0):
        super().__init__()
        self.n_bins = n_bins
        self.min_depth = min_depth
        self.max_depth = max_depth

        self.reassemble = _DPTReassemble(in_dim=in_dim, out_dim=decoder_dim)
        # Blocks 0-2 take a skip input; block 3 (the coarsest) has none.
        self.fusion_blocks = nn.ModuleList([
            _FeatureFusionBlock(decoder_dim, has_skip=True),
            _FeatureFusionBlock(decoder_dim, has_skip=True),
            _FeatureFusionBlock(decoder_dim, has_skip=True),
            _FeatureFusionBlock(decoder_dim, has_skip=False),
        ])
        self.head = nn.Sequential(
            nn.Conv2d(decoder_dim, decoder_dim, 3, padding=1, bias=False),
            nn.BatchNorm2d(decoder_dim),
            nn.GELU(),
            nn.Conv2d(decoder_dim, n_bins, 1),
        )

    def forward(self, intermediates: List[Tensor], H: int, W: int,
                return_distribution: bool = False):
        """Return depth ``[B, 1, h, w]``, or ``(depth, distribution, bins)``
        when ``return_distribution`` is true."""
        pyramid = self.reassemble(intermediates, H, W)

        # Start at the coarsest level, then merge in each finer level.
        fused = self.fusion_blocks[3](pyramid[3])
        for level in (2, 1, 0):
            fused = self.fusion_blocks[level](fused, skip=pyramid[level])

        bin_logits = self.head(fused)
        # The +0.1 floor keeps every bin strictly positive before normalizing.
        distribution = torch.relu(bin_logits) + 0.1
        distribution = distribution / distribution.sum(dim=1, keepdim=True)
        bins = torch.linspace(self.min_depth, self.max_depth, self.n_bins,
                              device=fused.device)
        depth = torch.einsum("bkhw,k->bhw", distribution, bins).unsqueeze(1)
        return (depth, distribution, bins) if return_distribution else depth
1526
+
1527
+
1528
+ # ===========================================================================
1529
+ # Argus model (transformers-compatible)
1530
+ # ===========================================================================
1531
+
1532
+
1533
class ArgusConfig(PretrainedConfig):
    """Configuration for the Argus model.

    Stores the backbone width and patch size, the segmentation, depth and
    classification head settings, and the detection head hyper-parameters.
    List-valued options left as ``None`` fall back to an empty list, except
    ``detection_class_names`` which falls back to a copy of ``COCO_CLASSES``.
    """

    model_type = "argus"

    def __init__(
        self,
        embed_dim: int = 768,
        patch_size: int = 16,
        num_seg_classes: int = 150,
        depth_n_bins: int = 256,
        depth_min_depth: float = 0.001,
        depth_max_depth: float = 10.0,
        num_imagenet_classes: int = 1000,
        class_ids: Optional[list] = None,
        class_names: Optional[list] = None,
        detection_num_classes: int = 80,
        detection_hidden: int = 160,
        detection_n_std_layers: int = 5,
        detection_n_dw_layers: int = 4,
        detection_n_scales: int = 4,
        detection_pos_emb_dim: int = 64,
        detection_text_embed_dim: int = 768,
        detection_class_names: Optional[list] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Backbone geometry.
        self.embed_dim = embed_dim
        self.patch_size = patch_size

        # Segmentation and depth heads.
        self.num_seg_classes = num_seg_classes
        self.depth_n_bins = depth_n_bins
        self.depth_min_depth = depth_min_depth
        self.depth_max_depth = depth_max_depth

        # Classification: class count plus per-index ids and names.
        self.num_imagenet_classes = num_imagenet_classes
        self.class_ids = class_ids if class_ids else []
        self.class_names = class_names if class_names else []

        # Detection head.
        self.detection_num_classes = detection_num_classes
        self.detection_hidden = detection_hidden
        self.detection_n_std_layers = detection_n_std_layers
        self.detection_n_dw_layers = detection_n_dw_layers
        self.detection_n_scales = detection_n_scales
        self.detection_pos_emb_dim = detection_pos_emb_dim
        self.detection_text_embed_dim = detection_text_embed_dim
        self.detection_class_names = (
            detection_class_names if detection_class_names else list(COCO_CLASSES))
1575
+
1576
+
1577
+ class Argus(PreTrainedModel):
1578
+ config_class = ArgusConfig
1579
+ base_model_prefix = "argus"
1580
+ supports_gradient_checkpointing = False
1581
+ _tied_weights_keys: list = []
1582
+ all_tied_weights_keys: dict = {}
1583
+
1584
def __init__(self, config: ArgusConfig):
    """Build the backbone and all task heads from ``config``.

    The backbone parameters are frozen, and the backbone and every head are
    switched to eval mode.
    """
    super().__init__(config)
    width = config.embed_dim
    n_cls = config.num_imagenet_classes

    self.backbone = build_eupe_vitb16()
    self.seg_head = SegmentationHead(width, config.num_seg_classes)
    self.depth_head = DPTDepthDecoder(
        in_dim=width,
        decoder_dim=256,
        n_bins=config.depth_n_bins,
        min_depth=config.depth_min_depth,
        max_depth=config.depth_max_depth,
    )

    # Classifier weights are stored as buffers, zero-filled until loaded.
    self.register_buffer("class_logit_weight", torch.zeros(n_cls, width), persistent=True)
    self.register_buffer("class_logit_bias", torch.zeros(n_cls), persistent=True)

    self.detection_head = SplitTowerHead(
        feat_dim=width,
        hidden=config.detection_hidden,
        n_std_layers=config.detection_n_std_layers,
        n_dw_layers=config.detection_n_dw_layers,
        n_scales=config.detection_n_scales,
        pos_emb_dim=config.detection_pos_emb_dim,
        num_classes=config.detection_num_classes,
        text_embed_dim=config.detection_text_embed_dim,
    )

    # Freeze the backbone.
    for param in self.backbone.parameters():
        param.requires_grad = False
    for part in (self.backbone, self.seg_head, self.depth_head, self.detection_head):
        part.eval()
1622
+
1623
+ def _init_weights(self, module):
1624
+ # HF reallocates missing buffers and parameters with torch.empty()
1625
+ # (uninitialized memory) on from_pretrained. Populate sensible defaults
1626
+ # for the standard layer types used by the detection head, and zero any
1627
+ # Argus-level buffer that came back NaN.
1628
+ if isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)):
1629
+ nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
1630
+ if module.bias is not None:
1631
+ nn.init.zeros_(module.bias)
1632
+ elif isinstance(module, nn.GroupNorm):
1633
+ nn.init.ones_(module.weight)
1634
+ nn.init.zeros_(module.bias)
1635
+
1636
+ if module is self:
1637
+ for name in ("class_logit_weight", "class_logit_bias"):
1638
+ if hasattr(self, name):
1639
+ buf = getattr(self, name)
1640
+ if torch.isnan(buf).any() or torch.isinf(buf).any():
1641
+ buf.data.zero_()
1642
+
1643
@property
def class_ids(self):
    """Classification class ids, read from ``self.config.class_ids``."""
    return self.config.class_ids
1646
+
1647
@property
def class_names(self):
    """Classification class names, read from ``self.config.class_names``."""
    return self.config.class_names
1650
+
1651
def quantize_int8(self):
    """Apply INT8 weight-only quantization via torchao. Reduces VRAM by ~11%
    with negligible accuracy loss (<0.05 m depth drift, 100% classification
    agreement). Requires torchao: pip install torchao.

    Quantizes the model in place and returns ``self`` so calls can be chained.

    Raises:
        ImportError: if torchao cannot be imported.
    """
    try:
        import torchao.quantization as _tq
        quantize_ = _tq.quantize_
        Int8WeightOnlyConfig = _tq.Int8WeightOnlyConfig
    except ImportError as e:
        raise ImportError("torchao is required for INT8 quantization: pip install torchao") from e
    quantize_(self, Int8WeightOnlyConfig())
    return self
1661
+
1662
+ @torch.inference_mode()
1663
+ def _extract(self, image_tensor: Tensor) -> Tuple[Tensor, Tensor]:
1664
+ with torch.autocast(self.device.type, dtype=torch.bfloat16, enabled=self.device.type == "cuda"):
1665
+ out = self.backbone.forward_features(image_tensor)
1666
+ cls = out["x_norm_clstoken"].float()
1667
+ patches = out["x_norm_patchtokens"].float()
1668
+ B, N, D = patches.shape
1669
+ h = w = int(N ** 0.5)
1670
+ spatial = patches.permute(0, 2, 1).reshape(B, D, h, w)
1671
+ return cls, spatial
1672
+
1673
@torch.inference_mode()
def classify(self, image_or_images, top_k: int = 5):
    """Classify one image or a list of images at 224 px.

    The L2-normalized CLS feature is passed through the stored linear
    classifier (``class_logit_weight`` / ``class_logit_bias``) and a softmax.

    Args:
        image_or_images: a single image or a collection, as accepted by
            ``_normalize_image_input``.
        top_k: number of entries to return per image. Values larger than the
            number of classes are reduced to the number of classes.

    Returns:
        Per image, a list of ``{"class_id", "class_name", "score"}`` dicts
        sorted by descending score. The first entry also carries ``"margin"``,
        the gap between the top-1 and top-2 probabilities. A single list is
        returned for single-image input, otherwise a list of lists.
    """
    single, images = _normalize_image_input(image_or_images)
    transform = make_eupe_transform(224)
    batch = torch.stack([transform(img) for img in images]).to(self.device)
    cls, _ = self._extract(batch)
    cls = F.normalize(cls, dim=-1)

    weight = self.class_logit_weight.to(cls.dtype)
    bias = self.class_logit_bias.to(cls.dtype)
    logits = F.linear(cls, weight, bias)
    scores_full = F.softmax(logits, dim=-1)

    # topk raises when k exceeds the class dimension, so cap the request.
    k = min(top_k, scores_full.shape[-1])
    topk = scores_full.topk(k, dim=-1)
    top2 = scores_full.topk(2, dim=-1)
    margins = (top2.values[:, 0] - top2.values[:, 1]).tolist()

    results = []
    for image_idx in range(len(images)):
        entries = []
        for score, idx in zip(topk.values[image_idx].tolist(),
                              topk.indices[image_idx].tolist()):
            entries.append({
                "class_id": self.class_ids[idx],
                "class_name": self.class_names[idx],
                "score": float(score),
            })
        # With top_k == 0 there is no first entry to attach the margin to.
        if entries:
            entries[0]["margin"] = float(margins[image_idx])
        results.append(entries)
    return results[0] if single else results
1702
+
1703
@torch.inference_mode()
def segment(self, image_or_images, resolution: int = 512, return_confidence: bool = False):
    """Predict a per-pixel class map at ``resolution x resolution``.

    Head logits are bilinearly upsampled to the requested resolution and
    arg-maxed over the class axis.

    Returns:
        ``[H, W]`` label maps, one per image (a bare tensor for single-image
        input, otherwise a list). With ``return_confidence`` each item is a
        ``(labels, confidence)`` pair, where confidence is the maximum softmax
        probability at each pixel.
    """
    single, images = _normalize_image_input(image_or_images)
    transform = make_eupe_transform(resolution)
    batch = torch.stack([transform(img) for img in images]).to(self.device)
    _, spatial = self._extract(batch)

    device_type = self.device.type
    with torch.autocast(device_type, dtype=torch.bfloat16, enabled=device_type == "cuda"):
        logits = self.seg_head(spatial)
    logits = F.interpolate(logits, size=(resolution, resolution),
                           mode="bilinear", align_corners=False)
    seg_maps = logits.argmax(dim=1)  # [B, H, W]

    if not return_confidence:
        if single:
            return seg_maps[0]
        return [seg_maps[i] for i in range(len(images))]

    probs = F.softmax(logits.float(), dim=1)
    conf_maps = probs.max(dim=1).values  # [B, H, W] in [0, 1]
    if single:
        return seg_maps[0], conf_maps[0]
    return [(seg_maps[i], conf_maps[i]) for i in range(len(images))]
1724
+
1725
@torch.inference_mode()
def depth(self, image_or_images, resolution: int = 416, return_confidence: bool = False):
    """Predict a depth map at ``resolution x resolution``.

    Forward hooks capture the outputs of the backbone blocks listed in
    ``HOOK_BLOCK_INDICES`` during one backbone pass. Those features feed the
    depth decoder. The decoder output has a border strip cropped away and is
    then bilinearly resized to the requested resolution.

    Returns:
        ``[H, W]`` float depth maps, one per image (a bare tensor for
        single-image input, otherwise a list). With ``return_confidence``
        each item is a ``(depth, std)`` pair, where ``std`` is the standard
        deviation of the per-pixel bin distribution.
    """
    single, images = _normalize_image_input(image_or_images)
    transform = make_eupe_transform(resolution)
    batch = torch.stack([transform(img) for img in images]).to(self.device)

    # Hook into intermediate ViT blocks for multi-scale features
    intermediates = {}

    def _make_hook(block_idx):
        def _hook(module, inp, out):
            intermediates[block_idx] = out[0] if isinstance(out, list) else out
        return _hook

    hooks = []
    try:
        for idx in HOOK_BLOCK_INDICES:
            hooks.append(self.backbone.blocks[idx].register_forward_hook(_make_hook(idx)))
        with torch.autocast(self.device.type, dtype=torch.bfloat16, enabled=self.device.type == "cuda"):
            self.backbone.forward_features(batch)
    finally:
        # Always detach the hooks, even if the forward pass raised; otherwise
        # they would stay registered on the shared backbone and fire on every
        # later call.
        for handle in hooks:
            handle.remove()

    inter_list = [intermediates[idx].float() for idx in HOOK_BLOCK_INDICES]
    H = W = resolution // 16
    if return_confidence:
        depth_b, distribution, bins = self.depth_head(
            inter_list, H, W, return_distribution=True)
        # Std of the 256-bin depth distribution: var = E[X^2] - E[X]^2.
        mean_sq = torch.einsum("bkhw,k->bhw", distribution, bins ** 2)
        variance = (mean_sq - depth_b.squeeze(1) ** 2).clamp(min=0)
        std_b = torch.sqrt(variance).unsqueeze(1)
    else:
        depth_b = self.depth_head(inter_list, H, W)
        std_b = None

    # Crop the DPT fusion border artifact (zero-padding in the conv chain
    # produces systematically wrong edge values that compound across 4 stages)
    crop = max(4, depth_b.shape[2] // 13)
    depth_b = depth_b[:, :, crop:-crop, crop:-crop]
    depth_b = F.interpolate(depth_b, size=(resolution, resolution), mode="bilinear", align_corners=False)
    if std_b is not None:
        std_b = std_b[:, :, crop:-crop, crop:-crop]
        std_b = F.interpolate(std_b, size=(resolution, resolution), mode="bilinear", align_corners=False)

    depth_squeezed = depth_b[:, 0].float()

    if return_confidence:
        std_squeezed = std_b[:, 0].float()
        if single:
            return depth_squeezed[0], std_squeezed[0]
        return [(depth_squeezed[i], std_squeezed[i]) for i in range(len(images))]

    if single:
        return depth_squeezed[0]
    return [depth_squeezed[i] for i in range(len(images))]
1779
+
1780
@torch.inference_mode()
def correspond(
    self,
    src_image: Image.Image,
    tgt_image: Image.Image,
    src_keypoints: list,
    resolution: int = 512,
):
    """Find the target-image match for each source keypoint.

    Backbone features of both images are bilinearly upsampled to
    ``resolution x resolution`` and L2-normalized per pixel. For each source
    keypoint the target pixel with the highest dot-product similarity is
    selected.

    Args:
        src_keypoints: iterable of ``(x, y)`` in source-image pixel coordinates.

    Returns:
        A list of ``[x, y]`` in target-image pixel coordinates, one per keypoint.
    """
    src_w, src_h = src_image.size
    tgt_w, tgt_h = tgt_image.size
    transform = make_eupe_transform(resolution)
    src_batch = transform(src_image).unsqueeze(0).to(self.device)
    tgt_batch = transform(tgt_image).unsqueeze(0).to(self.device)

    _, src_feats = self._extract(src_batch)
    _, tgt_feats = self._extract(tgt_batch)

    full = (resolution, resolution)
    src_feats = F.interpolate(src_feats, size=full, mode="bilinear", align_corners=False)
    tgt_feats = F.interpolate(tgt_feats, size=full, mode="bilinear", align_corners=False)

    # [1, D, H, W] -> [H, W, D] with a unit-norm feature vector per pixel.
    src_feats = F.normalize(src_feats[0].permute(1, 2, 0), dim=-1)
    tgt_feats = F.normalize(tgt_feats[0].permute(1, 2, 0), dim=-1)

    last = resolution - 1
    matches = []
    for keypoint in src_keypoints:
        # Map to the feature grid and clamp to valid indices.
        col = min(max(int(keypoint[0] / src_w * resolution), 0), last)
        row = min(max(int(keypoint[1] / src_h * resolution), 0), last)
        query = src_feats[row, col]
        similarity = torch.einsum("d,hwd->hw", query, tgt_feats)
        best_row, best_col = divmod(similarity.argmax().item(), resolution)
        matches.append([best_col / resolution * tgt_w, best_row / resolution * tgt_h])
    return matches
1813
+
1814
@torch.inference_mode()
def detect(
    self,
    image_or_images,
    resolution: int = 768,
    score_thresh: float = 0.05,
    nms_thresh: float = 0.5,
    max_per_image: int = 100,
):
    """Detect objects in one image or a list of images.

    Returns:
        Per image, a list of dicts with ``"box"`` (xyxy in original-image
        pixels), ``"score"``, ``"label"`` and ``"class_name"``. A single list
        is returned for single-image input, otherwise a list of lists.
    """
    single, images = _normalize_image_input(image_or_images)

    # Letterbox each image to match the training transform (resize long side
    # to `resolution`, pad bottom/right with black). Box coordinates are
    # recovered after decoding by unscaling.
    letterboxed = [_letterbox_to_square(img, resolution) for img in images]
    canvases = [item[0] for item in letterboxed]
    scales = [item[1] for item in letterboxed]
    orig_sizes = [item[2] for item in letterboxed]

    det_normalize = v2.Compose([
        v2.ToImage(),
        v2.ToDtype(torch.float32, scale=True),
        v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ])
    batch = torch.stack([det_normalize(c) for c in canvases]).to(self.device)

    _, spatial = self._extract(batch)
    device_type = self.device.type
    with torch.autocast(device_type, dtype=torch.bfloat16, enabled=device_type == "cuda"):
        cls_logits, box_regs, centernesses = self.detection_head(spatial)
    # Decode in float32 regardless of the autocast dtype.
    cls_logits = [t.float() for t in cls_logits]
    box_regs = [t.float() for t in box_regs]
    centernesses = [t.float() for t in centernesses]

    feature_sizes = [(t.shape[2], t.shape[3]) for t in cls_logits]
    locations = _make_locations(feature_sizes, FPN_STRIDES, spatial.device)
    image_sizes = [(resolution, resolution)] * len(images)

    results = _decode_detections(
        cls_logits, box_regs, centernesses, locations,
        image_sizes=image_sizes,
        score_thresh=score_thresh,
        nms_thresh=nms_thresh,
        max_per_image=max_per_image,
    )

    class_names = self.config.detection_class_names
    formatted = []
    for result, scale, (orig_w, orig_h) in zip(results, scales, orig_sizes):
        # Undo the letterbox scaling, then clip to the original image.
        boxes = result["boxes"].cpu().numpy() / scale
        boxes[:, 0::2] = boxes[:, 0::2].clip(0, orig_w)
        boxes[:, 1::2] = boxes[:, 1::2].clip(0, orig_h)
        scores = result["scores"].cpu().numpy()
        labels = result["labels"].cpu().numpy()

        detections = []
        for box, score, label in zip(boxes, scores, labels):
            label_id = int(label)
            if label_id < len(class_names):
                name = class_names[label_id]
            else:
                name = f"class_{label_id}"
            detections.append({
                "box": [float(v) for v in box.tolist()],
                "score": float(score),
                "label": label_id,
                "class_name": name,
            })
        formatted.append(detections)

    return formatted[0] if single else formatted
1883
+
1884
def perceive(self, image_or_images, return_confidence: bool = False):
    """Run classification, segmentation and depth on the same input.

    Args:
        image_or_images: A single image or a collection of images, as
            accepted by ``_normalize_image_input``.
        return_confidence: Forwarded to ``self.segment`` and ``self.depth``.
            When true, each of their per-image outputs is unpacked as a
            ``(map, confidence_or_uncertainty)`` pair and the second
            element is included in the result.

    Returns:
        For a single input, one dict; otherwise a list of dicts, one per
        image. Each dict has ``"classification"``, ``"segmentation"``,
        ``"depth"`` and ``"timings_ms"``, plus
        ``"segmentation_confidence"`` and ``"depth_uncertainty"`` when
        ``return_confidence`` is true. ``"timings_ms"`` holds the
        whole-call stage durations in milliseconds (not per-image), and
        every entry gets its own copy.
    """
    single, images = _normalize_image_input(image_or_images)

    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted.
    # NOTE(review): on CUDA these intervals may measure launch time only,
    # unless the called methods synchronize — confirm.
    t0 = time.perf_counter()
    classif = self.classify(images, top_k=5)
    t1 = time.perf_counter()
    seg_out = self.segment(images, resolution=512, return_confidence=return_confidence)
    t2 = time.perf_counter()
    depth_out = self.depth(images, resolution=416, return_confidence=return_confidence)
    t3 = time.perf_counter()

    if return_confidence:
        seg_maps = [s for s, _ in seg_out]
        seg_confs = [c for _, c in seg_out]
        depth_maps = [d for d, _ in depth_out]
        depth_uncerts = [u for _, u in depth_out]
    else:
        seg_maps = seg_out
        depth_maps = depth_out
        seg_confs = depth_uncerts = None

    timings = {
        "classify": (t1 - t0) * 1000,
        "segment": (t2 - t1) * 1000,
        "depth": (t3 - t2) * 1000,
        "total": (t3 - t0) * 1000,
    }

    results = []
    for i in range(len(images)):
        entry = {
            "classification": classif[i],
            "segmentation": seg_maps[i].cpu().numpy(),
            "depth": depth_maps[i].cpu().numpy(),
            # Copy so that mutating one result's timings cannot silently
            # change every other result in the batch.
            "timings_ms": dict(timings),
        }
        if return_confidence:
            entry["segmentation_confidence"] = seg_confs[i].cpu().numpy()
            entry["depth_uncertainty"] = depth_uncerts[i].cpu().numpy()
        results.append(entry)
    return results[0] if single else results
1925
+
1926
def export_onnx(
    self,
    out_dir: str,
    backbone_resolution: int = 224,
    dynamic_batch: bool = True,
    verify: bool = True,
    tolerance: Union[float, Dict[str, float]] = 5e-2,
    opset_version: int = 17,
    include_nms: bool = False,
    nms_iou_threshold: float = 0.5,
    nms_score_threshold: float = 0.05,
    nms_max_detections: int = 100,
) -> dict:
    """Export backbone, classifier, seg head, depth head, and detection head to ONNX.

    Produces five graphs:
      - argus_backbone.onnx        image[B,3,H,W] -> cls[B,D], spatial[B,D,H/16,W/16]
      - argus_classifier.onnx      cls_token[B,D] -> probs[B,1000]
      - argus_seg_head.onnx        spatial_features[B,D,h,w] -> seg_logits[B,150,H,W]
      - argus_depth_head.onnx      intermediate_{0..3}[B,N+5,D] -> depth_map[B,1,~8h,~8w]
      - argus_detection_head.onnx  spatial_features[B,D,h,w] -> boxes, scores (+ labels, batch_indices if include_nms)

    The seg graph folds bilinear upsample to input resolution into the
    graph, so consumers argmax directly without a separate interpolation
    step. Correspondence has no learned parameters — it runs as
    cosine-max on the backbone's spatial output and needs no graph.

    ``include_nms=True`` bakes an ONNX NonMaxSuppression (opset >= 10)
    op into the detection head. The detection graph then emits four
    post-NMS tensors (boxes [M,4], scores [M], class_labels [M],
    batch_indices [M]) instead of the raw (boxes, scores) pair. Useful
    for single-shot TensorRT / mobile inference. The default
    ``include_nms=False`` leaves NMS to the consumer so they can choose
    hard vs soft, per-class vs global, and tune thresholds without
    re-exporting.

    ``tolerance`` can be a float (applied uniformly to every
    ``*_max_diff`` check) or a dict keyed by verification output name
    (e.g. ``{"detection_boxes_max_diff": 3.2, "default": 5e-2}``). The
    ``"default"`` key covers outputs not otherwise listed. If a float
    is passed, detection box coordinates get a resolution-scaled
    tolerance (``max(tolerance, backbone_resolution * 5e-3)``) because
    exp() in the FCOS regression path amplifies FP kernel-dispatch
    differences to pixel-scale absolute diffs.

    Args:
        out_dir: Directory the ``.onnx`` files are written to; created if
            it does not exist.
        backbone_resolution: Square input side used for the dummy export
            inputs; must be a multiple of ``self.config.patch_size``.
        dynamic_batch: Mark dimension 0 of graph inputs/outputs as a
            dynamic ``"batch"`` axis.
        verify: Run every exported graph through ONNX Runtime (CPU
            provider) and compare against the PyTorch wrappers.
        tolerance: See above.
        opset_version: ONNX opset passed to ``torch.onnx.export``.
        include_nms: See above.
        nms_iou_threshold: Forwarded to the detection export wrapper.
        nms_score_threshold: Forwarded to the detection export wrapper.
        nms_max_detections: Forwarded to the detection export wrapper.

    Returns:
        Dict with the keys ``"backbone"``, ``"classifier"``,
        ``"seg_head"``, ``"depth_head"`` and ``"detection_head"`` mapping
        to the written file paths, plus ``"verification"`` (a dict of
        measured diffs and tolerances) when ``verify`` is true.

    Raises:
        ValueError: ``backbone_resolution`` is not a multiple of the
            patch size.
        ImportError: ``verify`` is true and ``onnxruntime`` is missing.
        RuntimeError: A ``*_max_diff`` value exceeds its tolerance or is
            NaN.
    """
    import os
    import warnings

    os.makedirs(out_dir, exist_ok=True)

    if backbone_resolution % self.config.patch_size != 0:
        raise ValueError(
            f"backbone_resolution ({backbone_resolution}) must be a multiple of patch_size ({self.config.patch_size})"
        )
    spatial_resolution = backbone_resolution // self.config.patch_size

    if backbone_resolution < 320:
        warnings.warn(
            f"backbone_resolution={backbone_resolution} is below 320; the detection "
            f"head's coarsest FPN level (stride 128) collapses to <=2 locations per "
            f"side and the detection graph, while it exports and runs, cannot produce "
            f"useful detections at this resolution. Classifier, seg, and depth graphs "
            f"are unaffected. FCOS convention is 640-800px input; export at "
            f">= 512 for detection.",
            stacklevel=2,
        )

    wrapper = _BackboneExportWrapper(self.backbone).to(self.device).eval()

    dummy_image = torch.randn(
        1, 3, backbone_resolution, backbone_resolution,
        device=self.device, dtype=torch.float32,
    )
    dummy_spatial = torch.randn(
        1, self.config.embed_dim, spatial_resolution, spatial_resolution,
        device=self.device, dtype=torch.float32,
    )

    backbone_path = os.path.join(out_dir, "argus_backbone.onnx")
    classifier_path = os.path.join(out_dir, "argus_classifier.onnx")
    seg_path = os.path.join(out_dir, "argus_seg_head.onnx")
    depth_path = os.path.join(out_dir, "argus_depth_head.onnx")
    detection_path = os.path.join(out_dir, "argus_detection_head.onnx")

    depth_input_names = [f"intermediate_{i}" for i in range(len(HOOK_BLOCK_INDICES))]

    # Dynamic-axis specs; None exports every graph with the static batch
    # size of the dummy inputs (1).
    if dynamic_batch:
        backbone_axes = {
            "image": {0: "batch"},
            "cls_token": {0: "batch"},
            "spatial_features": {0: "batch"},
        }
        seg_axes = {
            "spatial_features": {0: "batch"},
            "seg_logits": {0: "batch"},
        }
        depth_axes = {name: {0: "batch"} for name in depth_input_names}
        depth_axes["depth_map"] = {0: "batch"}
        classifier_axes = {"cls_token": {0: "batch"}, "class_probs": {0: "batch"}}
    else:
        backbone_axes = None
        seg_axes = None
        depth_axes = None
        classifier_axes = None

    # dynamo path crashes on EUPE's list-based forward; use legacy.
    with torch.inference_mode():
        torch.onnx.export(
            wrapper, dummy_image, backbone_path,
            input_names=["image"],
            output_names=["cls_token", "spatial_features"],
            dynamic_axes=backbone_axes,
            opset_version=opset_version,
            do_constant_folding=True,
            dynamo=False,
        )

        seg_wrapper = _SegHeadExportWrapper(self.seg_head, backbone_resolution).to(self.device).eval()
        torch.onnx.export(
            seg_wrapper, dummy_spatial, seg_path,
            input_names=["spatial_features"],
            output_names=["seg_logits"],
            dynamic_axes=seg_axes,
            opset_version=opset_version,
            do_constant_folding=True,
            dynamo=False,
        )

        depth_wrapper = _DepthHeadExportWrapper(
            self.depth_head, spatial_resolution, spatial_resolution
        ).to(self.device).eval()
        num_patch_tokens = spatial_resolution * spatial_resolution + N_PREFIX_TOKENS
        dummy_inter = tuple(
            torch.randn(1, num_patch_tokens, self.config.embed_dim,
                        device=self.device, dtype=torch.float32)
            for _ in range(len(HOOK_BLOCK_INDICES))
        )
        torch.onnx.export(
            depth_wrapper, dummy_inter, depth_path,
            input_names=depth_input_names,
            output_names=["depth_map"],
            dynamic_axes=depth_axes,
            opset_version=opset_version,
            do_constant_folding=True,
            dynamo=False,
        )

        classifier_wrapper = _ClassifierExportWrapper(
            self.class_logit_weight, self.class_logit_bias
        ).to(self.device).eval()
        dummy_cls = torch.randn(
            1, self.config.embed_dim, device=self.device, dtype=torch.float32,
        )
        torch.onnx.export(
            classifier_wrapper, dummy_cls, classifier_path,
            input_names=["cls_token"],
            output_names=["class_probs"],
            dynamic_axes=classifier_axes,
            opset_version=opset_version,
            do_constant_folding=True,
            dynamo=False,
        )

        detection_wrapper = _DetectionHeadExportWrapper(
            self.detection_head, backbone_resolution,
            include_nms=include_nms,
            nms_iou_threshold=nms_iou_threshold,
            nms_score_threshold=nms_score_threshold,
            nms_max_detections=nms_max_detections,
        ).to(self.device).eval()
        if include_nms:
            detection_output_names = ["boxes", "scores", "class_labels", "batch_indices"]
            # Post-NMS outputs are flat [M, ...]; no fixed batch axis to mark.
            # Spatial features input still has a dynamic batch dim so the graph
            # supports multi-image inference even with fused NMS.
            detection_axes = {"spatial_features": {0: "batch"}} if dynamic_batch else None
        else:
            detection_output_names = ["boxes", "scores"]
            if dynamic_batch:
                detection_axes = {
                    "spatial_features": {0: "batch"},
                    "boxes": {0: "batch"},
                    "scores": {0: "batch"},
                }
            else:
                detection_axes = None
        torch.onnx.export(
            detection_wrapper, dummy_spatial, detection_path,
            input_names=["spatial_features"],
            output_names=detection_output_names,
            dynamic_axes=detection_axes,
            opset_version=opset_version,
            do_constant_folding=True,
            dynamo=False,
        )

    result = {
        "backbone": backbone_path,
        "classifier": classifier_path,
        "seg_head": seg_path,
        "depth_head": depth_path,
        "detection_head": detection_path,
    }

    if verify:
        try:
            import onnxruntime as ort
        except ImportError as e:
            raise ImportError("onnxruntime not installed; pip install onnxruntime") from e

        providers = ["CPUExecutionProvider"]

        # A batch of 2 exercises the dynamic batch axis. Without dynamic
        # axes the graphs are fixed at the dummy batch size of 1, and
        # ONNX Runtime rejects any other batch size, so verify with 1.
        verify_batch = 2 if dynamic_batch else 1
        verify_image = torch.randn(verify_batch, 3, backbone_resolution, backbone_resolution, dtype=torch.float32)
        verify_spatial = torch.randn(verify_batch, self.config.embed_dim, spatial_resolution, spatial_resolution, dtype=torch.float32)
        verify_cls = torch.randn(verify_batch, self.config.embed_dim, dtype=torch.float32)
        verify_inter = [
            torch.randn(verify_batch, num_patch_tokens, self.config.embed_dim, dtype=torch.float32)
            for _ in range(len(HOOK_BLOCK_INDICES))
        ]

        # PyTorch reference outputs from the same wrappers that were exported.
        with torch.inference_mode():
            ref_cls, ref_spatial = wrapper(verify_image.to(self.device))
            ref_seg = seg_wrapper(verify_spatial.to(self.device))
            ref_depth = depth_wrapper(*[v.to(self.device) for v in verify_inter])
            ref_probs = classifier_wrapper(verify_cls.to(self.device))
            ref_det = detection_wrapper(verify_spatial.to(self.device))

        sess = ort.InferenceSession(backbone_path, providers=providers)
        ort_cls, ort_spatial = sess.run(None, {"image": verify_image.numpy()})
        cls_diff = float(np.abs(ort_cls - ref_cls.cpu().numpy()).max())
        spatial_diff = float(np.abs(ort_spatial - ref_spatial.cpu().numpy()).max())

        sess = ort.InferenceSession(seg_path, providers=providers)
        ort_seg = sess.run(None, {"spatial_features": verify_spatial.numpy()})[0]
        seg_diff = float(np.abs(ort_seg - ref_seg.cpu().numpy()).max())

        sess = ort.InferenceSession(depth_path, providers=providers)
        ort_depth = sess.run(None, {f"intermediate_{i}": verify_inter[i].numpy()
                                    for i in range(len(HOOK_BLOCK_INDICES))})[0]
        depth_diff = float(np.abs(ort_depth - ref_depth.cpu().numpy()).max())

        sess = ort.InferenceSession(classifier_path, providers=providers)
        ort_probs = sess.run(None, {"cls_token": verify_cls.numpy()})[0]
        classifier_diff = float(np.abs(ort_probs - ref_probs.cpu().numpy()).max())

        sess = ort.InferenceSession(detection_path, providers=providers)
        ort_det = sess.run(None, {"spatial_features": verify_spatial.numpy()})

        verification = {
            "backbone_cls_max_diff": cls_diff,
            "backbone_spatial_max_diff": spatial_diff,
            "classifier_max_diff": classifier_diff,
            "seg_head_max_diff": seg_diff,
            "depth_head_max_diff": depth_diff,
            "verified_batch_size": verify_batch,
        }

        if include_nms:
            # NMS is inherently implementation-dependent: ONNX's
            # NonMaxSuppression and the torchvision eager fallback differ
            # on tie-breaking when multiple detections share a score or
            # when near-threshold boxes are right at the score cutoff.
            # Element-wise comparison of post-NMS outputs is the wrong
            # metric. The structural checks below verify the graph runs,
            # returns reasonable shapes, and agrees on the top detection.
            pt_boxes, pt_scores, pt_labels, _ = ref_det
            ort_boxes, ort_scores, ort_labels, _ = ort_det
            pt_n = int(pt_scores.shape[0])
            ort_n = int(ort_scores.shape[0])
            verification["detection_nms_ref_count"] = pt_n
            verification["detection_nms_ort_count"] = ort_n
            if pt_n > 0 and ort_n > 0:
                pt_top = int(pt_scores.cpu().numpy().argmax())
                ort_top = int(ort_scores.argmax())
                pt_top_box = pt_boxes[pt_top].cpu().numpy()
                ort_top_box = ort_boxes[ort_top]
                # IoU of the two top boxes
                x1 = max(pt_top_box[0], ort_top_box[0])
                y1 = max(pt_top_box[1], ort_top_box[1])
                x2 = min(pt_top_box[2], ort_top_box[2])
                y2 = min(pt_top_box[3], ort_top_box[3])
                inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
                pt_area = max(0.0, pt_top_box[2] - pt_top_box[0]) * max(0.0, pt_top_box[3] - pt_top_box[1])
                ort_area = max(0.0, ort_top_box[2] - ort_top_box[0]) * max(0.0, ort_top_box[3] - ort_top_box[1])
                union = max(1e-6, pt_area + ort_area - inter)
                verification["detection_nms_top_iou"] = float(inter / union)
                verification["detection_nms_top_class_match"] = bool(
                    int(pt_labels[pt_top].cpu()) == int(ort_labels[ort_top])
                )
                verification["detection_nms_top_score_diff"] = float(abs(
                    float(pt_scores[pt_top].cpu()) - float(ort_scores[ort_top])
                ))
            else:
                verification["detection_nms_top_iou"] = None
                verification["detection_nms_top_class_match"] = None
                verification["detection_nms_top_score_diff"] = None
        else:
            ort_boxes, ort_scores = ort_det
            ref_boxes, ref_scores = ref_det
            verification["detection_boxes_max_diff"] = float(
                np.abs(ort_boxes - ref_boxes.cpu().numpy()).max())
            verification["detection_scores_max_diff"] = float(
                np.abs(ort_scores - ref_scores.cpu().numpy()).max())

        # Tolerance resolution: either a float applied uniformly, or a dict
        # keyed by verification output name (with optional "default" key).
        # Detection boxes get a resolution-scaled tolerance when only a
        # float is supplied — exp() in the FCOS regression path amplifies
        # FP kernel-dispatch differences to pixel-scale absolute diffs.
        if isinstance(tolerance, dict):
            default_tol = float(tolerance.get("default", 5e-2))

            def _tol_for(key):
                return float(tolerance.get(key, default_tol))

            verification["tolerance"] = dict(tolerance)
        else:
            base = float(tolerance)
            box_tol = max(base, backbone_resolution * 5e-3)

            def _tol_for(key):
                return box_tol if key == "detection_boxes_max_diff" else base

            verification["tolerance"] = base
            verification["detection_boxes_tolerance"] = box_tol

        for key, val in list(verification.items()):
            if not key.endswith("_max_diff"):
                continue
            t = _tol_for(key)
            # `not (val <= t)` rather than `val > t`: a NaN diff compares
            # false both ways and must count as a failure, not a pass.
            if not (val <= t):
                raise RuntimeError(
                    f"ONNX/PyTorch divergence in {key}: {val:.2e} > tolerance {t:.2e}"
                )
        result["verification"] = verification

    return result
config.json ADDED
@@ -0,0 +1,2029 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Argus"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "argus.ArgusConfig",
7
+ "AutoModel": "argus.Argus"
8
+ },
9
+ "model_type": "argus",
10
+ "embed_dim": 768,
11
+ "patch_size": 16,
12
+ "num_seg_classes": 150,
13
+ "depth_n_bins": 256,
14
+ "depth_min_depth": 0.001,
15
+ "depth_max_depth": 10.0,
16
+ "num_imagenet_classes": 1000,
17
+ "class_ids": [
18
+ "n01440764",
19
+ "n01443537",
20
+ "n01484850",
21
+ "n01491361",
22
+ "n01494475",
23
+ "n01496331",
24
+ "n01498041",
25
+ "n01514668",
26
+ "n01514859",
27
+ "n01518878",
28
+ "n01530575",
29
+ "n01531178",
30
+ "n01532829",
31
+ "n01534433",
32
+ "n01537544",
33
+ "n01558993",
34
+ "n01560419",
35
+ "n01580077",
36
+ "n01582220",
37
+ "n01592084",
38
+ "n01601694",
39
+ "n01608432",
40
+ "n01614925",
41
+ "n01616318",
42
+ "n01622779",
43
+ "n01629819",
44
+ "n01630670",
45
+ "n01631663",
46
+ "n01632458",
47
+ "n01632777",
48
+ "n01641577",
49
+ "n01644373",
50
+ "n01644900",
51
+ "n01664065",
52
+ "n01665541",
53
+ "n01667114",
54
+ "n01667778",
55
+ "n01669191",
56
+ "n01675722",
57
+ "n01677366",
58
+ "n01682714",
59
+ "n01685808",
60
+ "n01687978",
61
+ "n01688243",
62
+ "n01689811",
63
+ "n01692333",
64
+ "n01693334",
65
+ "n01694178",
66
+ "n01695060",
67
+ "n01697457",
68
+ "n01698640",
69
+ "n01704323",
70
+ "n01728572",
71
+ "n01728920",
72
+ "n01729322",
73
+ "n01729977",
74
+ "n01734418",
75
+ "n01735189",
76
+ "n01737021",
77
+ "n01739381",
78
+ "n01740131",
79
+ "n01742172",
80
+ "n01744401",
81
+ "n01748264",
82
+ "n01749939",
83
+ "n01751748",
84
+ "n01753488",
85
+ "n01755581",
86
+ "n01756291",
87
+ "n01768244",
88
+ "n01770081",
89
+ "n01770393",
90
+ "n01773157",
91
+ "n01773549",
92
+ "n01773797",
93
+ "n01774384",
94
+ "n01774750",
95
+ "n01775062",
96
+ "n01776313",
97
+ "n01784675",
98
+ "n01795545",
99
+ "n01796340",
100
+ "n01797886",
101
+ "n01798484",
102
+ "n01806143",
103
+ "n01806567",
104
+ "n01807496",
105
+ "n01817953",
106
+ "n01818515",
107
+ "n01819313",
108
+ "n01820546",
109
+ "n01824575",
110
+ "n01828970",
111
+ "n01829413",
112
+ "n01833805",
113
+ "n01843065",
114
+ "n01843383",
115
+ "n01847000",
116
+ "n01855032",
117
+ "n01855672",
118
+ "n01860187",
119
+ "n01871265",
120
+ "n01872401",
121
+ "n01873310",
122
+ "n01877812",
123
+ "n01882714",
124
+ "n01883070",
125
+ "n01910747",
126
+ "n01914609",
127
+ "n01917289",
128
+ "n01924916",
129
+ "n01930112",
130
+ "n01943899",
131
+ "n01944390",
132
+ "n01945685",
133
+ "n01950731",
134
+ "n01955084",
135
+ "n01968897",
136
+ "n01978287",
137
+ "n01978455",
138
+ "n01980166",
139
+ "n01981276",
140
+ "n01983481",
141
+ "n01984695",
142
+ "n01985128",
143
+ "n01986214",
144
+ "n01990800",
145
+ "n02002556",
146
+ "n02002724",
147
+ "n02006656",
148
+ "n02007558",
149
+ "n02009229",
150
+ "n02009912",
151
+ "n02011460",
152
+ "n02012849",
153
+ "n02013706",
154
+ "n02017213",
155
+ "n02018207",
156
+ "n02018795",
157
+ "n02025239",
158
+ "n02027492",
159
+ "n02028035",
160
+ "n02033041",
161
+ "n02037110",
162
+ "n02051845",
163
+ "n02056570",
164
+ "n02058221",
165
+ "n02066245",
166
+ "n02071294",
167
+ "n02074367",
168
+ "n02077923",
169
+ "n02085620",
170
+ "n02085782",
171
+ "n02085936",
172
+ "n02086079",
173
+ "n02086240",
174
+ "n02086646",
175
+ "n02086910",
176
+ "n02087046",
177
+ "n02087394",
178
+ "n02088094",
179
+ "n02088238",
180
+ "n02088364",
181
+ "n02088466",
182
+ "n02088632",
183
+ "n02089078",
184
+ "n02089867",
185
+ "n02089973",
186
+ "n02090379",
187
+ "n02090622",
188
+ "n02090721",
189
+ "n02091032",
190
+ "n02091134",
191
+ "n02091244",
192
+ "n02091467",
193
+ "n02091635",
194
+ "n02091831",
195
+ "n02092002",
196
+ "n02092339",
197
+ "n02093256",
198
+ "n02093428",
199
+ "n02093647",
200
+ "n02093754",
201
+ "n02093859",
202
+ "n02093991",
203
+ "n02094114",
204
+ "n02094258",
205
+ "n02094433",
206
+ "n02095314",
207
+ "n02095570",
208
+ "n02095889",
209
+ "n02096051",
210
+ "n02096177",
211
+ "n02096294",
212
+ "n02096437",
213
+ "n02096585",
214
+ "n02097047",
215
+ "n02097130",
216
+ "n02097209",
217
+ "n02097298",
218
+ "n02097474",
219
+ "n02097658",
220
+ "n02098105",
221
+ "n02098286",
222
+ "n02098413",
223
+ "n02099267",
224
+ "n02099429",
225
+ "n02099601",
226
+ "n02099712",
227
+ "n02099849",
228
+ "n02100236",
229
+ "n02100583",
230
+ "n02100735",
231
+ "n02100877",
232
+ "n02101006",
233
+ "n02101388",
234
+ "n02101556",
235
+ "n02102040",
236
+ "n02102177",
237
+ "n02102318",
238
+ "n02102480",
239
+ "n02102973",
240
+ "n02104029",
241
+ "n02104365",
242
+ "n02105056",
243
+ "n02105162",
244
+ "n02105251",
245
+ "n02105412",
246
+ "n02105505",
247
+ "n02105641",
248
+ "n02105855",
249
+ "n02106030",
250
+ "n02106166",
251
+ "n02106382",
252
+ "n02106550",
253
+ "n02106662",
254
+ "n02107142",
255
+ "n02107312",
256
+ "n02107574",
257
+ "n02107683",
258
+ "n02107908",
259
+ "n02108000",
260
+ "n02108089",
261
+ "n02108422",
262
+ "n02108551",
263
+ "n02108915",
264
+ "n02109047",
265
+ "n02109525",
266
+ "n02109961",
267
+ "n02110063",
268
+ "n02110185",
269
+ "n02110341",
270
+ "n02110627",
271
+ "n02110806",
272
+ "n02110958",
273
+ "n02111129",
274
+ "n02111277",
275
+ "n02111500",
276
+ "n02111889",
277
+ "n02112018",
278
+ "n02112137",
279
+ "n02112350",
280
+ "n02112706",
281
+ "n02113023",
282
+ "n02113186",
283
+ "n02113624",
284
+ "n02113712",
285
+ "n02113799",
286
+ "n02113978",
287
+ "n02114367",
288
+ "n02114548",
289
+ "n02114712",
290
+ "n02114855",
291
+ "n02115641",
292
+ "n02115913",
293
+ "n02116738",
294
+ "n02117135",
295
+ "n02119022",
296
+ "n02119789",
297
+ "n02120079",
298
+ "n02120505",
299
+ "n02123045",
300
+ "n02123159",
301
+ "n02123394",
302
+ "n02123597",
303
+ "n02124075",
304
+ "n02125311",
305
+ "n02127052",
306
+ "n02128385",
307
+ "n02128757",
308
+ "n02128925",
309
+ "n02129165",
310
+ "n02129604",
311
+ "n02130308",
312
+ "n02132136",
313
+ "n02133161",
314
+ "n02134084",
315
+ "n02134418",
316
+ "n02137549",
317
+ "n02138441",
318
+ "n02165105",
319
+ "n02165456",
320
+ "n02167151",
321
+ "n02168699",
322
+ "n02169497",
323
+ "n02172182",
324
+ "n02174001",
325
+ "n02177972",
326
+ "n02190166",
327
+ "n02206856",
328
+ "n02219486",
329
+ "n02226429",
330
+ "n02229544",
331
+ "n02231487",
332
+ "n02233338",
333
+ "n02236044",
334
+ "n02256656",
335
+ "n02259212",
336
+ "n02264363",
337
+ "n02268443",
338
+ "n02268853",
339
+ "n02276258",
340
+ "n02277742",
341
+ "n02279972",
342
+ "n02280649",
343
+ "n02281406",
344
+ "n02281787",
345
+ "n02317335",
346
+ "n02319095",
347
+ "n02321529",
348
+ "n02325366",
349
+ "n02326432",
350
+ "n02328150",
351
+ "n02342885",
352
+ "n02346627",
353
+ "n02356798",
354
+ "n02361337",
355
+ "n02363005",
356
+ "n02364673",
357
+ "n02389026",
358
+ "n02391049",
359
+ "n02395406",
360
+ "n02396427",
361
+ "n02397096",
362
+ "n02398521",
363
+ "n02403003",
364
+ "n02408429",
365
+ "n02410509",
366
+ "n02412080",
367
+ "n02415577",
368
+ "n02417914",
369
+ "n02422106",
370
+ "n02422699",
371
+ "n02423022",
372
+ "n02437312",
373
+ "n02437616",
374
+ "n02441942",
375
+ "n02442845",
376
+ "n02443114",
377
+ "n02443484",
378
+ "n02444819",
379
+ "n02445715",
380
+ "n02447366",
381
+ "n02454379",
382
+ "n02457408",
383
+ "n02480495",
384
+ "n02480855",
385
+ "n02481823",
386
+ "n02483362",
387
+ "n02483708",
388
+ "n02484975",
389
+ "n02486261",
390
+ "n02486410",
391
+ "n02487347",
392
+ "n02488291",
393
+ "n02488702",
394
+ "n02489166",
395
+ "n02490219",
396
+ "n02492035",
397
+ "n02492660",
398
+ "n02493509",
399
+ "n02493793",
400
+ "n02494079",
401
+ "n02497673",
402
+ "n02500267",
403
+ "n02504013",
404
+ "n02504458",
405
+ "n02509815",
406
+ "n02510455",
407
+ "n02514041",
408
+ "n02526121",
409
+ "n02536864",
410
+ "n02606052",
411
+ "n02607072",
412
+ "n02640242",
413
+ "n02641379",
414
+ "n02643566",
415
+ "n02655020",
416
+ "n02666196",
417
+ "n02667093",
418
+ "n02669723",
419
+ "n02672831",
420
+ "n02676566",
421
+ "n02687172",
422
+ "n02690373",
423
+ "n02692877",
424
+ "n02699494",
425
+ "n02701002",
426
+ "n02704792",
427
+ "n02708093",
428
+ "n02727426",
429
+ "n02730930",
430
+ "n02747177",
431
+ "n02749479",
432
+ "n02769748",
433
+ "n02776631",
434
+ "n02777292",
435
+ "n02782093",
436
+ "n02783161",
437
+ "n02786058",
438
+ "n02787622",
439
+ "n02788148",
440
+ "n02790996",
441
+ "n02791124",
442
+ "n02791270",
443
+ "n02793495",
444
+ "n02794156",
445
+ "n02795169",
446
+ "n02797295",
447
+ "n02799071",
448
+ "n02802426",
449
+ "n02804414",
450
+ "n02804610",
451
+ "n02807133",
452
+ "n02808304",
453
+ "n02808440",
454
+ "n02814533",
455
+ "n02814860",
456
+ "n02815834",
457
+ "n02817516",
458
+ "n02823428",
459
+ "n02823750",
460
+ "n02825657",
461
+ "n02834397",
462
+ "n02835271",
463
+ "n02837789",
464
+ "n02840245",
465
+ "n02841315",
466
+ "n02843684",
467
+ "n02859443",
468
+ "n02860847",
469
+ "n02865351",
470
+ "n02869837",
471
+ "n02870880",
472
+ "n02871525",
473
+ "n02877765",
474
+ "n02879718",
475
+ "n02883205",
476
+ "n02892201",
477
+ "n02892767",
478
+ "n02894605",
479
+ "n02895154",
480
+ "n02906734",
481
+ "n02909870",
482
+ "n02910353",
483
+ "n02916936",
484
+ "n02917067",
485
+ "n02927161",
486
+ "n02930766",
487
+ "n02939185",
488
+ "n02948072",
489
+ "n02950826",
490
+ "n02951358",
491
+ "n02951585",
492
+ "n02963159",
493
+ "n02965783",
494
+ "n02966193",
495
+ "n02966687",
496
+ "n02971356",
497
+ "n02974003",
498
+ "n02977058",
499
+ "n02978881",
500
+ "n02979186",
501
+ "n02980441",
502
+ "n02981792",
503
+ "n02988304",
504
+ "n02992211",
505
+ "n02992529",
506
+ "n02999410",
507
+ "n03000134",
508
+ "n03000247",
509
+ "n03000684",
510
+ "n03014705",
511
+ "n03016953",
512
+ "n03017168",
513
+ "n03018349",
514
+ "n03026506",
515
+ "n03028079",
516
+ "n03032252",
517
+ "n03041632",
518
+ "n03042490",
519
+ "n03045698",
520
+ "n03047690",
521
+ "n03062245",
522
+ "n03063599",
523
+ "n03063689",
524
+ "n03065424",
525
+ "n03075370",
526
+ "n03085013",
527
+ "n03089624",
528
+ "n03095699",
529
+ "n03100240",
530
+ "n03109150",
531
+ "n03110669",
532
+ "n03124043",
533
+ "n03124170",
534
+ "n03125729",
535
+ "n03126707",
536
+ "n03127747",
537
+ "n03127925",
538
+ "n03131574",
539
+ "n03133878",
540
+ "n03134739",
541
+ "n03141823",
542
+ "n03146219",
543
+ "n03160309",
544
+ "n03179701",
545
+ "n03180011",
546
+ "n03187595",
547
+ "n03188531",
548
+ "n03196217",
549
+ "n03197337",
550
+ "n03201208",
551
+ "n03207743",
552
+ "n03207941",
553
+ "n03208938",
554
+ "n03216828",
555
+ "n03218198",
556
+ "n03220513",
557
+ "n03223299",
558
+ "n03240683",
559
+ "n03249569",
560
+ "n03250847",
561
+ "n03255030",
562
+ "n03259280",
563
+ "n03271574",
564
+ "n03272010",
565
+ "n03272562",
566
+ "n03290653",
567
+ "n03291819",
568
+ "n03297495",
569
+ "n03314780",
570
+ "n03325584",
571
+ "n03337140",
572
+ "n03344393",
573
+ "n03345487",
574
+ "n03347037",
575
+ "n03355925",
576
+ "n03372029",
577
+ "n03376595",
578
+ "n03379051",
579
+ "n03384352",
580
+ "n03388043",
581
+ "n03388183",
582
+ "n03388549",
583
+ "n03393912",
584
+ "n03394916",
585
+ "n03400231",
586
+ "n03404251",
587
+ "n03417042",
588
+ "n03424325",
589
+ "n03425413",
590
+ "n03443371",
591
+ "n03444034",
592
+ "n03445777",
593
+ "n03445924",
594
+ "n03447447",
595
+ "n03447721",
596
+ "n03450230",
597
+ "n03452741",
598
+ "n03457902",
599
+ "n03459775",
600
+ "n03461385",
601
+ "n03467068",
602
+ "n03476684",
603
+ "n03476991",
604
+ "n03478589",
605
+ "n03481172",
606
+ "n03482405",
607
+ "n03483316",
608
+ "n03485407",
609
+ "n03485794",
610
+ "n03492542",
611
+ "n03494278",
612
+ "n03495258",
613
+ "n03496892",
614
+ "n03498962",
615
+ "n03527444",
616
+ "n03529860",
617
+ "n03530642",
618
+ "n03532672",
619
+ "n03534580",
620
+ "n03535780",
621
+ "n03538406",
622
+ "n03544143",
623
+ "n03584254",
624
+ "n03584829",
625
+ "n03590841",
626
+ "n03594734",
627
+ "n03594945",
628
+ "n03595614",
629
+ "n03598930",
630
+ "n03599486",
631
+ "n03602883",
632
+ "n03617480",
633
+ "n03623198",
634
+ "n03627232",
635
+ "n03630383",
636
+ "n03633091",
637
+ "n03637318",
638
+ "n03642806",
639
+ "n03649909",
640
+ "n03657121",
641
+ "n03658185",
642
+ "n03661043",
643
+ "n03662601",
644
+ "n03666591",
645
+ "n03670208",
646
+ "n03673027",
647
+ "n03676483",
648
+ "n03680355",
649
+ "n03690938",
650
+ "n03691459",
651
+ "n03692522",
652
+ "n03697007",
653
+ "n03706229",
654
+ "n03709823",
655
+ "n03710193",
656
+ "n03710637",
657
+ "n03710721",
658
+ "n03717622",
659
+ "n03720891",
660
+ "n03721384",
661
+ "n03724870",
662
+ "n03729826",
663
+ "n03733131",
664
+ "n03733281",
665
+ "n03733805",
666
+ "n03742115",
667
+ "n03743016",
668
+ "n03759954",
669
+ "n03761084",
670
+ "n03763968",
671
+ "n03764736",
672
+ "n03769881",
673
+ "n03770439",
674
+ "n03770679",
675
+ "n03773504",
676
+ "n03775071",
677
+ "n03775546",
678
+ "n03776460",
679
+ "n03777568",
680
+ "n03777754",
681
+ "n03781244",
682
+ "n03782006",
683
+ "n03785016",
684
+ "n03786901",
685
+ "n03787032",
686
+ "n03788195",
687
+ "n03788365",
688
+ "n03791053",
689
+ "n03792782",
690
+ "n03792972",
691
+ "n03793489",
692
+ "n03794056",
693
+ "n03796401",
694
+ "n03803284",
695
+ "n03804744",
696
+ "n03814639",
697
+ "n03814906",
698
+ "n03825788",
699
+ "n03832673",
700
+ "n03837869",
701
+ "n03838899",
702
+ "n03840681",
703
+ "n03841143",
704
+ "n03843555",
705
+ "n03854065",
706
+ "n03857828",
707
+ "n03866082",
708
+ "n03868242",
709
+ "n03868863",
710
+ "n03871628",
711
+ "n03873416",
712
+ "n03874293",
713
+ "n03874599",
714
+ "n03876231",
715
+ "n03877472",
716
+ "n03877845",
717
+ "n03884397",
718
+ "n03887697",
719
+ "n03888257",
720
+ "n03888605",
721
+ "n03891251",
722
+ "n03891332",
723
+ "n03895866",
724
+ "n03899768",
725
+ "n03902125",
726
+ "n03903868",
727
+ "n03908618",
728
+ "n03908714",
729
+ "n03916031",
730
+ "n03920288",
731
+ "n03924679",
732
+ "n03929660",
733
+ "n03929855",
734
+ "n03930313",
735
+ "n03930630",
736
+ "n03933933",
737
+ "n03935335",
738
+ "n03937543",
739
+ "n03938244",
740
+ "n03942813",
741
+ "n03944341",
742
+ "n03947888",
743
+ "n03950228",
744
+ "n03954731",
745
+ "n03956157",
746
+ "n03958227",
747
+ "n03961711",
748
+ "n03967562",
749
+ "n03970156",
750
+ "n03976467",
751
+ "n03976657",
752
+ "n03977966",
753
+ "n03980874",
754
+ "n03982430",
755
+ "n03983396",
756
+ "n03991062",
757
+ "n03992509",
758
+ "n03995372",
759
+ "n03998194",
760
+ "n04004767",
761
+ "n04005630",
762
+ "n04008634",
763
+ "n04009552",
764
+ "n04019541",
765
+ "n04023962",
766
+ "n04026417",
767
+ "n04033901",
768
+ "n04033995",
769
+ "n04037443",
770
+ "n04039381",
771
+ "n04040759",
772
+ "n04041544",
773
+ "n04044716",
774
+ "n04049303",
775
+ "n04065272",
776
+ "n04067472",
777
+ "n04069434",
778
+ "n04070727",
779
+ "n04074963",
780
+ "n04081281",
781
+ "n04086273",
782
+ "n04090263",
783
+ "n04099969",
784
+ "n04111531",
785
+ "n04116512",
786
+ "n04118538",
787
+ "n04118776",
788
+ "n04120489",
789
+ "n04125021",
790
+ "n04127249",
791
+ "n04131690",
792
+ "n04133789",
793
+ "n04136333",
794
+ "n04141076",
795
+ "n04141327",
796
+ "n04141975",
797
+ "n04146614",
798
+ "n04147183",
799
+ "n04149813",
800
+ "n04152593",
801
+ "n04153751",
802
+ "n04154565",
803
+ "n04162706",
804
+ "n04179913",
805
+ "n04192698",
806
+ "n04200800",
807
+ "n04201297",
808
+ "n04204238",
809
+ "n04204347",
810
+ "n04208210",
811
+ "n04209133",
812
+ "n04209239",
813
+ "n04228054",
814
+ "n04229816",
815
+ "n04235860",
816
+ "n04238763",
817
+ "n04239074",
818
+ "n04243546",
819
+ "n04251144",
820
+ "n04252077",
821
+ "n04252225",
822
+ "n04254120",
823
+ "n04254680",
824
+ "n04254777",
825
+ "n04258138",
826
+ "n04259630",
827
+ "n04263257",
828
+ "n04264628",
829
+ "n04265275",
830
+ "n04266014",
831
+ "n04270147",
832
+ "n04273569",
833
+ "n04275548",
834
+ "n04277352",
835
+ "n04285008",
836
+ "n04286575",
837
+ "n04296562",
838
+ "n04310018",
839
+ "n04311004",
840
+ "n04311174",
841
+ "n04317175",
842
+ "n04325704",
843
+ "n04326547",
844
+ "n04328186",
845
+ "n04330267",
846
+ "n04332243",
847
+ "n04335435",
848
+ "n04336792",
849
+ "n04344873",
850
+ "n04346328",
851
+ "n04347754",
852
+ "n04350905",
853
+ "n04355338",
854
+ "n04355933",
855
+ "n04356056",
856
+ "n04357314",
857
+ "n04366367",
858
+ "n04367480",
859
+ "n04370456",
860
+ "n04371430",
861
+ "n04371774",
862
+ "n04372370",
863
+ "n04376876",
864
+ "n04380533",
865
+ "n04389033",
866
+ "n04392985",
867
+ "n04398044",
868
+ "n04399382",
869
+ "n04404412",
870
+ "n04409515",
871
+ "n04417672",
872
+ "n04418357",
873
+ "n04423845",
874
+ "n04428191",
875
+ "n04429376",
876
+ "n04435653",
877
+ "n04442312",
878
+ "n04443257",
879
+ "n04447861",
880
+ "n04456115",
881
+ "n04458633",
882
+ "n04461696",
883
+ "n04462240",
884
+ "n04465501",
885
+ "n04467665",
886
+ "n04476259",
887
+ "n04479046",
888
+ "n04482393",
889
+ "n04483307",
890
+ "n04485082",
891
+ "n04486054",
892
+ "n04487081",
893
+ "n04487394",
894
+ "n04493381",
895
+ "n04501370",
896
+ "n04505470",
897
+ "n04507155",
898
+ "n04509417",
899
+ "n04515003",
900
+ "n04517823",
901
+ "n04522168",
902
+ "n04523525",
903
+ "n04525038",
904
+ "n04525305",
905
+ "n04532106",
906
+ "n04532670",
907
+ "n04536866",
908
+ "n04540053",
909
+ "n04542943",
910
+ "n04548280",
911
+ "n04548362",
912
+ "n04550184",
913
+ "n04552348",
914
+ "n04553703",
915
+ "n04554684",
916
+ "n04557648",
917
+ "n04560804",
918
+ "n04562935",
919
+ "n04579145",
920
+ "n04579432",
921
+ "n04584207",
922
+ "n04589890",
923
+ "n04590129",
924
+ "n04591157",
925
+ "n04591713",
926
+ "n04592741",
927
+ "n04596742",
928
+ "n04597913",
929
+ "n04599235",
930
+ "n04604644",
931
+ "n04606251",
932
+ "n04612504",
933
+ "n04613696",
934
+ "n06359193",
935
+ "n06596364",
936
+ "n06785654",
937
+ "n06794110",
938
+ "n06874185",
939
+ "n07248320",
940
+ "n07565083",
941
+ "n07579787",
942
+ "n07583066",
943
+ "n07584110",
944
+ "n07590611",
945
+ "n07613480",
946
+ "n07614500",
947
+ "n07615774",
948
+ "n07684084",
949
+ "n07693725",
950
+ "n07695742",
951
+ "n07697313",
952
+ "n07697537",
953
+ "n07711569",
954
+ "n07714571",
955
+ "n07714990",
956
+ "n07715103",
957
+ "n07716358",
958
+ "n07716906",
959
+ "n07717410",
960
+ "n07717556",
961
+ "n07718472",
962
+ "n07718747",
963
+ "n07720875",
964
+ "n07730033",
965
+ "n07734744",
966
+ "n07742313",
967
+ "n07745940",
968
+ "n07747607",
969
+ "n07749582",
970
+ "n07753113",
971
+ "n07753275",
972
+ "n07753592",
973
+ "n07754684",
974
+ "n07760859",
975
+ "n07768694",
976
+ "n07802026",
977
+ "n07831146",
978
+ "n07836838",
979
+ "n07860988",
980
+ "n07871810",
981
+ "n07873807",
982
+ "n07875152",
983
+ "n07880968",
984
+ "n07892512",
985
+ "n07920052",
986
+ "n07930864",
987
+ "n07932039",
988
+ "n09193705",
989
+ "n09229709",
990
+ "n09246464",
991
+ "n09256479",
992
+ "n09288635",
993
+ "n09332890",
994
+ "n09399592",
995
+ "n09421951",
996
+ "n09428293",
997
+ "n09468604",
998
+ "n09472597",
999
+ "n09835506",
1000
+ "n10148035",
1001
+ "n10565667",
1002
+ "n11879895",
1003
+ "n11939491",
1004
+ "n12057211",
1005
+ "n12144580",
1006
+ "n12267677",
1007
+ "n12620546",
1008
+ "n12768682",
1009
+ "n12985857",
1010
+ "n12998815",
1011
+ "n13037406",
1012
+ "n13040303",
1013
+ "n13044778",
1014
+ "n13052670",
1015
+ "n13054560",
1016
+ "n13133613",
1017
+ "n15075141"
1018
+ ],
1019
+ "class_names": [
1020
+ "tench, Tinca tinca",
1021
+ "goldfish, Carassius auratus",
1022
+ "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias",
1023
+ "tiger shark, Galeocerdo cuvieri",
1024
+ "hammerhead, hammerhead shark",
1025
+ "electric ray, crampfish, numbfish, torpedo",
1026
+ "stingray",
1027
+ "cock",
1028
+ "hen",
1029
+ "ostrich, Struthio camelus",
1030
+ "brambling, Fringilla montifringilla",
1031
+ "goldfinch, Carduelis carduelis",
1032
+ "house finch, linnet, Carpodacus mexicanus",
1033
+ "junco, snowbird",
1034
+ "indigo bunting, indigo finch, indigo bird, Passerina cyanea",
1035
+ "robin, American robin, Turdus migratorius",
1036
+ "bulbul",
1037
+ "jay",
1038
+ "magpie",
1039
+ "chickadee",
1040
+ "water ouzel, dipper",
1041
+ "kite",
1042
+ "bald eagle, American eagle, Haliaeetus leucocephalus",
1043
+ "vulture",
1044
+ "great grey owl, great gray owl, Strix nebulosa",
1045
+ "European fire salamander, Salamandra salamandra",
1046
+ "common newt, Triturus vulgaris",
1047
+ "eft",
1048
+ "spotted salamander, Ambystoma maculatum",
1049
+ "axolotl, mud puppy, Ambystoma mexicanum",
1050
+ "bullfrog, Rana catesbeiana",
1051
+ "tree frog, tree-frog",
1052
+ "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui",
1053
+ "loggerhead, loggerhead turtle, Caretta caretta",
1054
+ "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea",
1055
+ "mud turtle",
1056
+ "terrapin",
1057
+ "box turtle, box tortoise",
1058
+ "banded gecko",
1059
+ "common iguana, iguana, Iguana iguana",
1060
+ "American chameleon, anole, Anolis carolinensis",
1061
+ "whiptail, whiptail lizard",
1062
+ "agama",
1063
+ "frilled lizard, Chlamydosaurus kingi",
1064
+ "alligator lizard",
1065
+ "Gila monster, Heloderma suspectum",
1066
+ "green lizard, Lacerta viridis",
1067
+ "African chameleon, Chamaeleo chamaeleon",
1068
+ "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis",
1069
+ "African crocodile, Nile crocodile, Crocodylus niloticus",
1070
+ "American alligator, Alligator mississipiensis",
1071
+ "triceratops",
1072
+ "thunder snake, worm snake, Carphophis amoenus",
1073
+ "ringneck snake, ring-necked snake, ring snake",
1074
+ "hognose snake, puff adder, sand viper",
1075
+ "green snake, grass snake",
1076
+ "king snake, kingsnake",
1077
+ "garter snake, grass snake",
1078
+ "water snake",
1079
+ "vine snake",
1080
+ "night snake, Hypsiglena torquata",
1081
+ "boa constrictor, Constrictor constrictor",
1082
+ "rock python, rock snake, Python sebae",
1083
+ "Indian cobra, Naja naja",
1084
+ "green mamba",
1085
+ "sea snake",
1086
+ "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus",
1087
+ "diamondback, diamondback rattlesnake, Crotalus adamanteus",
1088
+ "sidewinder, horned rattlesnake, Crotalus cerastes",
1089
+ "trilobite",
1090
+ "harvestman, daddy longlegs, Phalangium opilio",
1091
+ "scorpion",
1092
+ "black and gold garden spider, Argiope aurantia",
1093
+ "barn spider, Araneus cavaticus",
1094
+ "garden spider, Aranea diademata",
1095
+ "black widow, Latrodectus mactans",
1096
+ "tarantula",
1097
+ "wolf spider, hunting spider",
1098
+ "tick",
1099
+ "centipede",
1100
+ "black grouse",
1101
+ "ptarmigan",
1102
+ "ruffed grouse, partridge, Bonasa umbellus",
1103
+ "prairie chicken, prairie grouse, prairie fowl",
1104
+ "peacock",
1105
+ "quail",
1106
+ "partridge",
1107
+ "African grey, African gray, Psittacus erithacus",
1108
+ "macaw",
1109
+ "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita",
1110
+ "lorikeet",
1111
+ "coucal",
1112
+ "bee eater",
1113
+ "hornbill",
1114
+ "hummingbird",
1115
+ "jacamar",
1116
+ "toucan",
1117
+ "drake",
1118
+ "red-breasted merganser, Mergus serrator",
1119
+ "goose",
1120
+ "black swan, Cygnus atratus",
1121
+ "tusker",
1122
+ "echidna, spiny anteater, anteater",
1123
+ "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus",
1124
+ "wallaby, brush kangaroo",
1125
+ "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus",
1126
+ "wombat",
1127
+ "jellyfish",
1128
+ "sea anemone, anemone",
1129
+ "brain coral",
1130
+ "flatworm, platyhelminth",
1131
+ "nematode, nematode worm, roundworm",
1132
+ "conch",
1133
+ "snail",
1134
+ "slug",
1135
+ "sea slug, nudibranch",
1136
+ "chiton, coat-of-mail shell, sea cradle, polyplacophore",
1137
+ "chambered nautilus, pearly nautilus, nautilus",
1138
+ "Dungeness crab, Cancer magister",
1139
+ "rock crab, Cancer irroratus",
1140
+ "fiddler crab",
1141
+ "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica",
1142
+ "American lobster, Northern lobster, Maine lobster, Homarus americanus",
1143
+ "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish",
1144
+ "crayfish, crawfish, crawdad, crawdaddy",
1145
+ "hermit crab",
1146
+ "isopod",
1147
+ "white stork, Ciconia ciconia",
1148
+ "black stork, Ciconia nigra",
1149
+ "spoonbill",
1150
+ "flamingo",
1151
+ "little blue heron, Egretta caerulea",
1152
+ "American egret, great white heron, Egretta albus",
1153
+ "bittern",
1154
+ "crane",
1155
+ "limpkin, Aramus pictus",
1156
+ "European gallinule, Porphyrio porphyrio",
1157
+ "American coot, marsh hen, mud hen, water hen, Fulica americana",
1158
+ "bustard",
1159
+ "ruddy turnstone, Arenaria interpres",
1160
+ "red-backed sandpiper, dunlin, Erolia alpina",
1161
+ "redshank, Tringa totanus",
1162
+ "dowitcher",
1163
+ "oystercatcher, oyster catcher",
1164
+ "pelican",
1165
+ "king penguin, Aptenodytes patagonica",
1166
+ "albatross, mollymawk",
1167
+ "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus",
1168
+ "killer whale, killer, orca, grampus, sea wolf, Orcinus orca",
1169
+ "dugong, Dugong dugon",
1170
+ "sea lion",
1171
+ "Chihuahua",
1172
+ "Japanese spaniel",
1173
+ "Maltese dog, Maltese terrier, Maltese",
1174
+ "Pekinese, Pekingese, Peke",
1175
+ "Shih-Tzu",
1176
+ "Blenheim spaniel",
1177
+ "papillon",
1178
+ "toy terrier",
1179
+ "Rhodesian ridgeback",
1180
+ "Afghan hound, Afghan",
1181
+ "basset, basset hound",
1182
+ "beagle",
1183
+ "bloodhound, sleuthhound",
1184
+ "bluetick",
1185
+ "black-and-tan coonhound",
1186
+ "Walker hound, Walker foxhound",
1187
+ "English foxhound",
1188
+ "redbone",
1189
+ "borzoi, Russian wolfhound",
1190
+ "Irish wolfhound",
1191
+ "Italian greyhound",
1192
+ "whippet",
1193
+ "Ibizan hound, Ibizan Podenco",
1194
+ "Norwegian elkhound, elkhound",
1195
+ "otterhound, otter hound",
1196
+ "Saluki, gazelle hound",
1197
+ "Scottish deerhound, deerhound",
1198
+ "Weimaraner",
1199
+ "Staffordshire bullterrier, Staffordshire bull terrier",
1200
+ "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier",
1201
+ "Bedlington terrier",
1202
+ "Border terrier",
1203
+ "Kerry blue terrier",
1204
+ "Irish terrier",
1205
+ "Norfolk terrier",
1206
+ "Norwich terrier",
1207
+ "Yorkshire terrier",
1208
+ "wire-haired fox terrier",
1209
+ "Lakeland terrier",
1210
+ "Sealyham terrier, Sealyham",
1211
+ "Airedale, Airedale terrier",
1212
+ "cairn, cairn terrier",
1213
+ "Australian terrier",
1214
+ "Dandie Dinmont, Dandie Dinmont terrier",
1215
+ "Boston bull, Boston terrier",
1216
+ "miniature schnauzer",
1217
+ "giant schnauzer",
1218
+ "standard schnauzer",
1219
+ "Scotch terrier, Scottish terrier, Scottie",
1220
+ "Tibetan terrier, chrysanthemum dog",
1221
+ "silky terrier, Sydney silky",
1222
+ "soft-coated wheaten terrier",
1223
+ "West Highland white terrier",
1224
+ "Lhasa, Lhasa apso",
1225
+ "flat-coated retriever",
1226
+ "curly-coated retriever",
1227
+ "golden retriever",
1228
+ "Labrador retriever",
1229
+ "Chesapeake Bay retriever",
1230
+ "German short-haired pointer",
1231
+ "vizsla, Hungarian pointer",
1232
+ "English setter",
1233
+ "Irish setter, red setter",
1234
+ "Gordon setter",
1235
+ "Brittany spaniel",
1236
+ "clumber, clumber spaniel",
1237
+ "English springer, English springer spaniel",
1238
+ "Welsh springer spaniel",
1239
+ "cocker spaniel, English cocker spaniel, cocker",
1240
+ "Sussex spaniel",
1241
+ "Irish water spaniel",
1242
+ "kuvasz",
1243
+ "schipperke",
1244
+ "groenendael",
1245
+ "malinois",
1246
+ "briard",
1247
+ "kelpie",
1248
+ "komondor",
1249
+ "Old English sheepdog, bobtail",
1250
+ "Shetland sheepdog, Shetland sheep dog, Shetland",
1251
+ "collie",
1252
+ "Border collie",
1253
+ "Bouvier des Flandres, Bouviers des Flandres",
1254
+ "Rottweiler",
1255
+ "German shepherd, German shepherd dog, German police dog, alsatian",
1256
+ "Doberman, Doberman pinscher",
1257
+ "miniature pinscher",
1258
+ "Greater Swiss Mountain dog",
1259
+ "Bernese mountain dog",
1260
+ "Appenzeller",
1261
+ "EntleBucher",
1262
+ "boxer",
1263
+ "bull mastiff",
1264
+ "Tibetan mastiff",
1265
+ "French bulldog",
1266
+ "Great Dane",
1267
+ "Saint Bernard, St Bernard",
1268
+ "Eskimo dog, husky",
1269
+ "malamute, malemute, Alaskan malamute",
1270
+ "Siberian husky",
1271
+ "dalmatian, coach dog, carriage dog",
1272
+ "affenpinscher, monkey pinscher, monkey dog",
1273
+ "basenji",
1274
+ "pug, pug-dog",
1275
+ "Leonberg",
1276
+ "Newfoundland, Newfoundland dog",
1277
+ "Great Pyrenees",
1278
+ "Samoyed, Samoyede",
1279
+ "Pomeranian",
1280
+ "chow, chow chow",
1281
+ "keeshond",
1282
+ "Brabancon griffon",
1283
+ "Pembroke, Pembroke Welsh corgi",
1284
+ "Cardigan, Cardigan Welsh corgi",
1285
+ "toy poodle",
1286
+ "miniature poodle",
1287
+ "standard poodle",
1288
+ "Mexican hairless",
1289
+ "timber wolf, grey wolf, gray wolf, Canis lupus",
1290
+ "white wolf, Arctic wolf, Canis lupus tundrarum",
1291
+ "red wolf, maned wolf, Canis rufus, Canis niger",
1292
+ "coyote, prairie wolf, brush wolf, Canis latrans",
1293
+ "dingo, warrigal, warragal, Canis dingo",
1294
+ "dhole, Cuon alpinus",
1295
+ "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus",
1296
+ "hyena, hyaena",
1297
+ "red fox, Vulpes vulpes",
1298
+ "kit fox, Vulpes macrotis",
1299
+ "Arctic fox, white fox, Alopex lagopus",
1300
+ "grey fox, gray fox, Urocyon cinereoargenteus",
1301
+ "tabby, tabby cat",
1302
+ "tiger cat",
1303
+ "Persian cat",
1304
+ "Siamese cat, Siamese",
1305
+ "Egyptian cat",
1306
+ "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor",
1307
+ "lynx, catamount",
1308
+ "leopard, Panthera pardus",
1309
+ "snow leopard, ounce, Panthera uncia",
1310
+ "jaguar, panther, Panthera onca, Felis onca",
1311
+ "lion, king of beasts, Panthera leo",
1312
+ "tiger, Panthera tigris",
1313
+ "cheetah, chetah, Acinonyx jubatus",
1314
+ "brown bear, bruin, Ursus arctos",
1315
+ "American black bear, black bear, Ursus americanus, Euarctos americanus",
1316
+ "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus",
1317
+ "sloth bear, Melursus ursinus, Ursus ursinus",
1318
+ "mongoose",
1319
+ "meerkat, mierkat",
1320
+ "tiger beetle",
1321
+ "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle",
1322
+ "ground beetle, carabid beetle",
1323
+ "long-horned beetle, longicorn, longicorn beetle",
1324
+ "leaf beetle, chrysomelid",
1325
+ "dung beetle",
1326
+ "rhinoceros beetle",
1327
+ "weevil",
1328
+ "fly",
1329
+ "bee",
1330
+ "ant, emmet, pismire",
1331
+ "grasshopper, hopper",
1332
+ "cricket",
1333
+ "walking stick, walkingstick, stick insect",
1334
+ "cockroach, roach",
1335
+ "mantis, mantid",
1336
+ "cicada, cicala",
1337
+ "leafhopper",
1338
+ "lacewing, lacewing fly",
1339
+ "dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk",
1340
+ "damselfly",
1341
+ "admiral",
1342
+ "ringlet, ringlet butterfly",
1343
+ "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus",
1344
+ "cabbage butterfly",
1345
+ "sulphur butterfly, sulfur butterfly",
1346
+ "lycaenid, lycaenid butterfly",
1347
+ "starfish, sea star",
1348
+ "sea urchin",
1349
+ "sea cucumber, holothurian",
1350
+ "wood rabbit, cottontail, cottontail rabbit",
1351
+ "hare",
1352
+ "Angora, Angora rabbit",
1353
+ "hamster",
1354
+ "porcupine, hedgehog",
1355
+ "fox squirrel, eastern fox squirrel, Sciurus niger",
1356
+ "marmot",
1357
+ "beaver",
1358
+ "guinea pig, Cavia cobaya",
1359
+ "sorrel",
1360
+ "zebra",
1361
+ "hog, pig, grunter, squealer, Sus scrofa",
1362
+ "wild boar, boar, Sus scrofa",
1363
+ "warthog",
1364
+ "hippopotamus, hippo, river horse, Hippopotamus amphibius",
1365
+ "ox",
1366
+ "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis",
1367
+ "bison",
1368
+ "ram, tup",
1369
+ "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis",
1370
+ "ibex, Capra ibex",
1371
+ "hartebeest",
1372
+ "impala, Aepyceros melampus",
1373
+ "gazelle",
1374
+ "Arabian camel, dromedary, Camelus dromedarius",
1375
+ "llama",
1376
+ "weasel",
1377
+ "mink",
1378
+ "polecat, fitch, foulmart, foumart, Mustela putorius",
1379
+ "black-footed ferret, ferret, Mustela nigripes",
1380
+ "otter",
1381
+ "skunk, polecat, wood pussy",
1382
+ "badger",
1383
+ "armadillo",
1384
+ "three-toed sloth, ai, Bradypus tridactylus",
1385
+ "orangutan, orang, orangutang, Pongo pygmaeus",
1386
+ "gorilla, Gorilla gorilla",
1387
+ "chimpanzee, chimp, Pan troglodytes",
1388
+ "gibbon, Hylobates lar",
1389
+ "siamang, Hylobates syndactylus, Symphalangus syndactylus",
1390
+ "guenon, guenon monkey",
1391
+ "patas, hussar monkey, Erythrocebus patas",
1392
+ "baboon",
1393
+ "macaque",
1394
+ "langur",
1395
+ "colobus, colobus monkey",
1396
+ "proboscis monkey, Nasalis larvatus",
1397
+ "marmoset",
1398
+ "capuchin, ringtail, Cebus capucinus",
1399
+ "howler monkey, howler",
1400
+ "titi, titi monkey",
1401
+ "spider monkey, Ateles geoffroyi",
1402
+ "squirrel monkey, Saimiri sciureus",
1403
+ "Madagascar cat, ring-tailed lemur, Lemur catta",
1404
+ "indri, indris, Indri indri, Indri brevicaudatus",
1405
+ "Indian elephant, Elephas maximus",
1406
+ "African elephant, Loxodonta africana",
1407
+ "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens",
1408
+ "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca",
1409
+ "barracouta, snoek",
1410
+ "eel",
1411
+ "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch",
1412
+ "rock beauty, Holocanthus tricolor",
1413
+ "anemone fish",
1414
+ "sturgeon",
1415
+ "gar, garfish, garpike, billfish, Lepisosteus osseus",
1416
+ "lionfish",
1417
+ "puffer, pufferfish, blowfish, globefish",
1418
+ "abacus",
1419
+ "abaya",
1420
+ "academic gown, academic robe, judge's robe",
1421
+ "accordion, piano accordion, squeeze box",
1422
+ "acoustic guitar",
1423
+ "aircraft carrier, carrier, flattop, attack aircraft carrier",
1424
+ "airliner",
1425
+ "airship, dirigible",
1426
+ "altar",
1427
+ "ambulance",
1428
+ "amphibian, amphibious vehicle",
1429
+ "analog clock",
1430
+ "apiary, bee house",
1431
+ "apron",
1432
+ "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
1433
+ "assault rifle, assault gun",
1434
+ "backpack, back pack, knapsack, packsack, rucksack, haversack",
1435
+ "bakery, bakeshop, bakehouse",
1436
+ "balance beam, beam",
1437
+ "balloon",
1438
+ "ballpoint, ballpoint pen, ballpen, Biro",
1439
+ "Band Aid",
1440
+ "banjo",
1441
+ "bannister, banister, balustrade, balusters, handrail",
1442
+ "barbell",
1443
+ "barber chair",
1444
+ "barbershop",
1445
+ "barn",
1446
+ "barometer",
1447
+ "barrel, cask",
1448
+ "barrow, garden cart, lawn cart, wheelbarrow",
1449
+ "baseball",
1450
+ "basketball",
1451
+ "bassinet",
1452
+ "bassoon",
1453
+ "bathing cap, swimming cap",
1454
+ "bath towel",
1455
+ "bathtub, bathing tub, bath, tub",
1456
+ "beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon",
1457
+ "beacon, lighthouse, beacon light, pharos",
1458
+ "beaker",
1459
+ "bearskin, busby, shako",
1460
+ "beer bottle",
1461
+ "beer glass",
1462
+ "bell cote, bell cot",
1463
+ "bib",
1464
+ "bicycle-built-for-two, tandem bicycle, tandem",
1465
+ "bikini, two-piece",
1466
+ "binder, ring-binder",
1467
+ "binoculars, field glasses, opera glasses",
1468
+ "birdhouse",
1469
+ "boathouse",
1470
+ "bobsled, bobsleigh, bob",
1471
+ "bolo tie, bolo, bola tie, bola",
1472
+ "bonnet, poke bonnet",
1473
+ "bookcase",
1474
+ "bookshop, bookstore, bookstall",
1475
+ "bottlecap",
1476
+ "bow",
1477
+ "bow tie, bow-tie, bowtie",
1478
+ "brass, memorial tablet, plaque",
1479
+ "brassiere, bra, bandeau",
1480
+ "breakwater, groin, groyne, mole, bulwark, seawall, jetty",
1481
+ "breastplate, aegis, egis",
1482
+ "broom",
1483
+ "bucket, pail",
1484
+ "buckle",
1485
+ "bulletproof vest",
1486
+ "bullet train, bullet",
1487
+ "butcher shop, meat market",
1488
+ "cab, hack, taxi, taxicab",
1489
+ "caldron, cauldron",
1490
+ "candle, taper, wax light",
1491
+ "cannon",
1492
+ "canoe",
1493
+ "can opener, tin opener",
1494
+ "cardigan",
1495
+ "car mirror",
1496
+ "carousel, carrousel, merry-go-round, roundabout, whirligig",
1497
+ "carpenter's kit, tool kit",
1498
+ "carton",
1499
+ "car wheel",
1500
+ "cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM",
1501
+ "cassette",
1502
+ "cassette player",
1503
+ "castle",
1504
+ "catamaran",
1505
+ "CD player",
1506
+ "cello, violoncello",
1507
+ "cellular telephone, cellular phone, cellphone, cell, mobile phone",
1508
+ "chain",
1509
+ "chainlink fence",
1510
+ "chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour",
1511
+ "chain saw, chainsaw",
1512
+ "chest",
1513
+ "chiffonier, commode",
1514
+ "chime, bell, gong",
1515
+ "china cabinet, china closet",
1516
+ "Christmas stocking",
1517
+ "church, church building",
1518
+ "cinema, movie theater, movie theatre, movie house, picture palace",
1519
+ "cleaver, meat cleaver, chopper",
1520
+ "cliff dwelling",
1521
+ "cloak",
1522
+ "clog, geta, patten, sabot",
1523
+ "cocktail shaker",
1524
+ "coffee mug",
1525
+ "coffeepot",
1526
+ "coil, spiral, volute, whorl, helix",
1527
+ "combination lock",
1528
+ "computer keyboard, keypad",
1529
+ "confectionery, confectionary, candy store",
1530
+ "container ship, containership, container vessel",
1531
+ "convertible",
1532
+ "corkscrew, bottle screw",
1533
+ "cornet, horn, trumpet, trump",
1534
+ "cowboy boot",
1535
+ "cowboy hat, ten-gallon hat",
1536
+ "cradle",
1537
+ "crane",
1538
+ "crash helmet",
1539
+ "crate",
1540
+ "crib, cot",
1541
+ "Crock Pot",
1542
+ "croquet ball",
1543
+ "crutch",
1544
+ "cuirass",
1545
+ "dam, dike, dyke",
1546
+ "desk",
1547
+ "desktop computer",
1548
+ "dial telephone, dial phone",
1549
+ "diaper, nappy, napkin",
1550
+ "digital clock",
1551
+ "digital watch",
1552
+ "dining table, board",
1553
+ "dishrag, dishcloth",
1554
+ "dishwasher, dish washer, dishwashing machine",
1555
+ "disk brake, disc brake",
1556
+ "dock, dockage, docking facility",
1557
+ "dogsled, dog sled, dog sleigh",
1558
+ "dome",
1559
+ "doormat, welcome mat",
1560
+ "drilling platform, offshore rig",
1561
+ "drum, membranophone, tympan",
1562
+ "drumstick",
1563
+ "dumbbell",
1564
+ "Dutch oven",
1565
+ "electric fan, blower",
1566
+ "electric guitar",
1567
+ "electric locomotive",
1568
+ "entertainment center",
1569
+ "envelope",
1570
+ "espresso maker",
1571
+ "face powder",
1572
+ "feather boa, boa",
1573
+ "file, file cabinet, filing cabinet",
1574
+ "fireboat",
1575
+ "fire engine, fire truck",
1576
+ "fire screen, fireguard",
1577
+ "flagpole, flagstaff",
1578
+ "flute, transverse flute",
1579
+ "folding chair",
1580
+ "football helmet",
1581
+ "forklift",
1582
+ "fountain",
1583
+ "fountain pen",
1584
+ "four-poster",
1585
+ "freight car",
1586
+ "French horn, horn",
1587
+ "frying pan, frypan, skillet",
1588
+ "fur coat",
1589
+ "garbage truck, dustcart",
1590
+ "gasmask, respirator, gas helmet",
1591
+ "gas pump, gasoline pump, petrol pump, island dispenser",
1592
+ "goblet",
1593
+ "go-kart",
1594
+ "golf ball",
1595
+ "golfcart, golf cart",
1596
+ "gondola",
1597
+ "gong, tam-tam",
1598
+ "gown",
1599
+ "grand piano, grand",
1600
+ "greenhouse, nursery, glasshouse",
1601
+ "grille, radiator grille",
1602
+ "grocery store, grocery, food market, market",
1603
+ "guillotine",
1604
+ "hair slide",
1605
+ "hair spray",
1606
+ "half track",
1607
+ "hammer",
1608
+ "hamper",
1609
+ "hand blower, blow dryer, blow drier, hair dryer, hair drier",
1610
+ "hand-held computer, hand-held microcomputer",
1611
+ "handkerchief, hankie, hanky, hankey",
1612
+ "hard disc, hard disk, fixed disk",
1613
+ "harmonica, mouth organ, harp, mouth harp",
1614
+ "harp",
1615
+ "harvester, reaper",
1616
+ "hatchet",
1617
+ "holster",
1618
+ "home theater, home theatre",
1619
+ "honeycomb",
1620
+ "hook, claw",
1621
+ "hoopskirt, crinoline",
1622
+ "horizontal bar, high bar",
1623
+ "horse cart, horse-cart",
1624
+ "hourglass",
1625
+ "iPod",
1626
+ "iron, smoothing iron",
1627
+ "jack-o'-lantern",
1628
+ "jean, blue jean, denim",
1629
+ "jeep, landrover",
1630
+ "jersey, T-shirt, tee shirt",
1631
+ "jigsaw puzzle",
1632
+ "jinrikisha, ricksha, rickshaw",
1633
+ "joystick",
1634
+ "kimono",
1635
+ "knee pad",
1636
+ "knot",
1637
+ "lab coat, laboratory coat",
1638
+ "ladle",
1639
+ "lampshade, lamp shade",
1640
+ "laptop, laptop computer",
1641
+ "lawn mower, mower",
1642
+ "lens cap, lens cover",
1643
+ "letter opener, paper knife, paperknife",
1644
+ "library",
1645
+ "lifeboat",
1646
+ "lighter, light, igniter, ignitor",
1647
+ "limousine, limo",
1648
+ "liner, ocean liner",
1649
+ "lipstick, lip rouge",
1650
+ "Loafer",
1651
+ "lotion",
1652
+ "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
1653
+ "loupe, jeweler's loupe",
1654
+ "lumbermill, sawmill",
1655
+ "magnetic compass",
1656
+ "mailbag, postbag",
1657
+ "mailbox, letter box",
1658
+ "maillot",
1659
+ "maillot, tank suit",
1660
+ "manhole cover",
1661
+ "maraca",
1662
+ "marimba, xylophone",
1663
+ "mask",
1664
+ "matchstick",
1665
+ "maypole",
1666
+ "maze, labyrinth",
1667
+ "measuring cup",
1668
+ "medicine chest, medicine cabinet",
1669
+ "megalith, megalithic structure",
1670
+ "microphone, mike",
1671
+ "microwave, microwave oven",
1672
+ "military uniform",
1673
+ "milk can",
1674
+ "minibus",
1675
+ "miniskirt, mini",
1676
+ "minivan",
1677
+ "missile",
1678
+ "mitten",
1679
+ "mixing bowl",
1680
+ "mobile home, manufactured home",
1681
+ "Model T",
1682
+ "modem",
1683
+ "monastery",
1684
+ "monitor",
1685
+ "moped",
1686
+ "mortar",
1687
+ "mortarboard",
1688
+ "mosque",
1689
+ "mosquito net",
1690
+ "motor scooter, scooter",
1691
+ "mountain bike, all-terrain bike, off-roader",
1692
+ "mountain tent",
1693
+ "mouse, computer mouse",
1694
+ "mousetrap",
1695
+ "moving van",
1696
+ "muzzle",
1697
+ "nail",
1698
+ "neck brace",
1699
+ "necklace",
1700
+ "nipple",
1701
+ "notebook, notebook computer",
1702
+ "obelisk",
1703
+ "oboe, hautboy, hautbois",
1704
+ "ocarina, sweet potato",
1705
+ "odometer, hodometer, mileometer, milometer",
1706
+ "oil filter",
1707
+ "organ, pipe organ",
1708
+ "oscilloscope, scope, cathode-ray oscilloscope, CRO",
1709
+ "overskirt",
1710
+ "oxcart",
1711
+ "oxygen mask",
1712
+ "packet",
1713
+ "paddle, boat paddle",
1714
+ "paddlewheel, paddle wheel",
1715
+ "padlock",
1716
+ "paintbrush",
1717
+ "pajama, pyjama, pj's, jammies",
1718
+ "palace",
1719
+ "panpipe, pandean pipe, syrinx",
1720
+ "paper towel",
1721
+ "parachute, chute",
1722
+ "parallel bars, bars",
1723
+ "park bench",
1724
+ "parking meter",
1725
+ "passenger car, coach, carriage",
1726
+ "patio, terrace",
1727
+ "pay-phone, pay-station",
1728
+ "pedestal, plinth, footstall",
1729
+ "pencil box, pencil case",
1730
+ "pencil sharpener",
1731
+ "perfume, essence",
1732
+ "Petri dish",
1733
+ "photocopier",
1734
+ "pick, plectrum, plectron",
1735
+ "pickelhaube",
1736
+ "picket fence, paling",
1737
+ "pickup, pickup truck",
1738
+ "pier",
1739
+ "piggy bank, penny bank",
1740
+ "pill bottle",
1741
+ "pillow",
1742
+ "ping-pong ball",
1743
+ "pinwheel",
1744
+ "pirate, pirate ship",
1745
+ "pitcher, ewer",
1746
+ "plane, carpenter's plane, woodworking plane",
1747
+ "planetarium",
1748
+ "plastic bag",
1749
+ "plate rack",
1750
+ "plow, plough",
1751
+ "plunger, plumber's helper",
1752
+ "Polaroid camera, Polaroid Land camera",
1753
+ "pole",
1754
+ "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria",
1755
+ "poncho",
1756
+ "pool table, billiard table, snooker table",
1757
+ "pop bottle, soda bottle",
1758
+ "pot, flowerpot",
1759
+ "potter's wheel",
1760
+ "power drill",
1761
+ "prayer rug, prayer mat",
1762
+ "printer",
1763
+ "prison, prison house",
1764
+ "projectile, missile",
1765
+ "projector",
1766
+ "puck, hockey puck",
1767
+ "punching bag, punch bag, punching ball, punchball",
1768
+ "purse",
1769
+ "quill, quill pen",
1770
+ "quilt, comforter, comfort, puff",
1771
+ "racer, race car, racing car",
1772
+ "racket, racquet",
1773
+ "radiator",
1774
+ "radio, wireless",
1775
+ "radio telescope, radio reflector",
1776
+ "rain barrel",
1777
+ "recreational vehicle, RV, R.V.",
1778
+ "reel",
1779
+ "reflex camera",
1780
+ "refrigerator, icebox",
1781
+ "remote control, remote",
1782
+ "restaurant, eating house, eating place, eatery",
1783
+ "revolver, six-gun, six-shooter",
1784
+ "rifle",
1785
+ "rocking chair, rocker",
1786
+ "rotisserie",
1787
+ "rubber eraser, rubber, pencil eraser",
1788
+ "rugby ball",
1789
+ "rule, ruler",
1790
+ "running shoe",
1791
+ "safe",
1792
+ "safety pin",
1793
+ "saltshaker, salt shaker",
1794
+ "sandal",
1795
+ "sarong",
1796
+ "sax, saxophone",
1797
+ "scabbard",
1798
+ "scale, weighing machine",
1799
+ "school bus",
1800
+ "schooner",
1801
+ "scoreboard",
1802
+ "screen, CRT screen",
1803
+ "screw",
1804
+ "screwdriver",
1805
+ "seat belt, seatbelt",
1806
+ "sewing machine",
1807
+ "shield, buckler",
1808
+ "shoe shop, shoe-shop, shoe store",
1809
+ "shoji",
1810
+ "shopping basket",
1811
+ "shopping cart",
1812
+ "shovel",
1813
+ "shower cap",
1814
+ "shower curtain",
1815
+ "ski",
1816
+ "ski mask",
1817
+ "sleeping bag",
1818
+ "slide rule, slipstick",
1819
+ "sliding door",
1820
+ "slot, one-armed bandit",
1821
+ "snorkel",
1822
+ "snowmobile",
1823
+ "snowplow, snowplough",
1824
+ "soap dispenser",
1825
+ "soccer ball",
1826
+ "sock",
1827
+ "solar dish, solar collector, solar furnace",
1828
+ "sombrero",
1829
+ "soup bowl",
1830
+ "space bar",
1831
+ "space heater",
1832
+ "space shuttle",
1833
+ "spatula",
1834
+ "speedboat",
1835
+ "spider web, spider's web",
1836
+ "spindle",
1837
+ "sports car, sport car",
1838
+ "spotlight, spot",
1839
+ "stage",
1840
+ "steam locomotive",
1841
+ "steel arch bridge",
1842
+ "steel drum",
1843
+ "stethoscope",
1844
+ "stole",
1845
+ "stone wall",
1846
+ "stopwatch, stop watch",
1847
+ "stove",
1848
+ "strainer",
1849
+ "streetcar, tram, tramcar, trolley, trolley car",
1850
+ "stretcher",
1851
+ "studio couch, day bed",
1852
+ "stupa, tope",
1853
+ "submarine, pigboat, sub, U-boat",
1854
+ "suit, suit of clothes",
1855
+ "sundial",
1856
+ "sunglass",
1857
+ "sunglasses, dark glasses, shades",
1858
+ "sunscreen, sunblock, sun blocker",
1859
+ "suspension bridge",
1860
+ "swab, swob, mop",
1861
+ "sweatshirt",
1862
+ "swimming trunks, bathing trunks",
1863
+ "swing",
1864
+ "switch, electric switch, electrical switch",
1865
+ "syringe",
1866
+ "table lamp",
1867
+ "tank, army tank, armored combat vehicle, armoured combat vehicle",
1868
+ "tape player",
1869
+ "teapot",
1870
+ "teddy, teddy bear",
1871
+ "television, television system",
1872
+ "tennis ball",
1873
+ "thatch, thatched roof",
1874
+ "theater curtain, theatre curtain",
1875
+ "thimble",
1876
+ "thresher, thrasher, threshing machine",
1877
+ "throne",
1878
+ "tile roof",
1879
+ "toaster",
1880
+ "tobacco shop, tobacconist shop, tobacconist",
1881
+ "toilet seat",
1882
+ "torch",
1883
+ "totem pole",
1884
+ "tow truck, tow car, wrecker",
1885
+ "toyshop",
1886
+ "tractor",
1887
+ "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi",
1888
+ "tray",
1889
+ "trench coat",
1890
+ "tricycle, trike, velocipede",
1891
+ "trimaran",
1892
+ "tripod",
1893
+ "triumphal arch",
1894
+ "trolleybus, trolley coach, trackless trolley",
1895
+ "trombone",
1896
+ "tub, vat",
1897
+ "turnstile",
1898
+ "typewriter keyboard",
1899
+ "umbrella",
1900
+ "unicycle, monocycle",
1901
+ "upright, upright piano",
1902
+ "vacuum, vacuum cleaner",
1903
+ "vase",
1904
+ "vault",
1905
+ "velvet",
1906
+ "vending machine",
1907
+ "vestment",
1908
+ "viaduct",
1909
+ "violin, fiddle",
1910
+ "volleyball",
1911
+ "waffle iron",
1912
+ "wall clock",
1913
+ "wallet, billfold, notecase, pocketbook",
1914
+ "wardrobe, closet, press",
1915
+ "warplane, military plane",
1916
+ "washbasin, handbasin, washbowl, lavabo, wash-hand basin",
1917
+ "washer, automatic washer, washing machine",
1918
+ "water bottle",
1919
+ "water jug",
1920
+ "water tower",
1921
+ "whiskey jug",
1922
+ "whistle",
1923
+ "wig",
1924
+ "window screen",
1925
+ "window shade",
1926
+ "Windsor tie",
1927
+ "wine bottle",
1928
+ "wing",
1929
+ "wok",
1930
+ "wooden spoon",
1931
+ "wool, woolen, woollen",
1932
+ "worm fence, snake fence, snake-rail fence, Virginia fence",
1933
+ "wreck",
1934
+ "yawl",
1935
+ "yurt",
1936
+ "web site, website, internet site, site",
1937
+ "comic book",
1938
+ "crossword puzzle, crossword",
1939
+ "street sign",
1940
+ "traffic light, traffic signal, stoplight",
1941
+ "book jacket, dust cover, dust jacket, dust wrapper",
1942
+ "menu",
1943
+ "plate",
1944
+ "guacamole",
1945
+ "consomme",
1946
+ "hot pot, hotpot",
1947
+ "trifle",
1948
+ "ice cream, icecream",
1949
+ "ice lolly, lolly, lollipop, popsicle",
1950
+ "French loaf",
1951
+ "bagel, beigel",
1952
+ "pretzel",
1953
+ "cheeseburger",
1954
+ "hotdog, hot dog, red hot",
1955
+ "mashed potato",
1956
+ "head cabbage",
1957
+ "broccoli",
1958
+ "cauliflower",
1959
+ "zucchini, courgette",
1960
+ "spaghetti squash",
1961
+ "acorn squash",
1962
+ "butternut squash",
1963
+ "cucumber, cuke",
1964
+ "artichoke, globe artichoke",
1965
+ "bell pepper",
1966
+ "cardoon",
1967
+ "mushroom",
1968
+ "Granny Smith",
1969
+ "strawberry",
1970
+ "orange",
1971
+ "lemon",
1972
+ "fig",
1973
+ "pineapple, ananas",
1974
+ "banana",
1975
+ "jackfruit, jak, jack",
1976
+ "custard apple",
1977
+ "pomegranate",
1978
+ "hay",
1979
+ "carbonara",
1980
+ "chocolate sauce, chocolate syrup",
1981
+ "dough",
1982
+ "meat loaf, meatloaf",
1983
+ "pizza, pizza pie",
1984
+ "potpie",
1985
+ "burrito",
1986
+ "red wine",
1987
+ "espresso",
1988
+ "cup",
1989
+ "eggnog",
1990
+ "alp",
1991
+ "bubble",
1992
+ "cliff, drop, drop-off",
1993
+ "coral reef",
1994
+ "geyser",
1995
+ "lakeside, lakeshore",
1996
+ "promontory, headland, head, foreland",
1997
+ "sandbar, sand bar",
1998
+ "seashore, coast, seacoast, sea-coast",
1999
+ "valley, vale",
2000
+ "volcano",
2001
+ "ballplayer, baseball player",
2002
+ "groom, bridegroom",
2003
+ "scuba diver",
2004
+ "rapeseed",
2005
+ "daisy",
2006
+ "yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum",
2007
+ "corn",
2008
+ "acorn",
2009
+ "hip, rose hip, rosehip",
2010
+ "buckeye, horse chestnut, conker",
2011
+ "coral fungus",
2012
+ "agaric",
2013
+ "gyromitra",
2014
+ "stinkhorn, carrion fungus",
2015
+ "earthstar",
2016
+ "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa",
2017
+ "bolete",
2018
+ "ear, spike, capitulum",
2019
+ "toilet tissue, toilet paper, bathroom tissue"
2020
+ ],
2021
+ "detection_num_classes": 80,
2022
+ "torch_dtype": "float32",
2023
+ "detection_hidden": 160,
2024
+ "detection_n_std_layers": 5,
2025
+ "detection_n_dw_layers": 4,
2026
+ "detection_n_scales": 4,
2027
+ "detection_pos_emb_dim": 64,
2028
+ "detection_text_embed_dim": 768
2029
+ }
model.bf16_backbone.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccd2f51f285eef54c7b1466e92d3da83c13f410e9476db70b614663f2825fe9a
3
+ size 240885372
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1837c48d63921ddc18e9fefa5adcf8292ab36728974bb7c9f14c16b1f71ea3d0
3
+ size 412169724
rf100vl_zero_shot_cross_domain_eval.json ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "picker": {
3
+ "path": "/mnt/d/detection-heads/heads/cofiber_threshold/split_tower_5scale_160h_5std_4dw_ema_l14_16ep_768_cls_calib/checkpoint_final.pth",
4
+ "n_params": 2975067,
5
+ "text_embed_dim": 768
6
+ },
7
+ "fcos": {
8
+ "path": "/mnt/d/_tmp/argus_fcos_head.pth",
9
+ "n_params": 16138074
10
+ },
11
+ "cache_dir": "/home/zootest/datasets/rf100vl_val_cache_768",
12
+ "resolution": 768,
13
+ "score_thresh": 0.05,
14
+ "max_per_image": 100,
15
+ "domains": [
16
+ {
17
+ "domain": "actions",
18
+ "n_items": 818,
19
+ "n_domain_classes": 6,
20
+ "domain_class_names": [
21
+ "Attack",
22
+ "Block",
23
+ "Defense",
24
+ "Serve",
25
+ "Set",
26
+ "ball"
27
+ ],
28
+ "mode_a_class_agnostic": {
29
+ "picker": {
30
+ "AR@100": 0.3958944594145987
31
+ },
32
+ "fcos": {
33
+ "AR@100": 0.3750611259658937
34
+ },
35
+ "delta_AR@100": 0.020833333448705027
36
+ },
37
+ "mode_b_text_swap_picker": null,
38
+ "mode_b_error": "skipped (--skip-text-swap)"
39
+ },
40
+ {
41
+ "domain": "aerial-airport",
42
+ "n_items": 66,
43
+ "n_domain_classes": 1,
44
+ "domain_class_names": [
45
+ "airplane"
46
+ ],
47
+ "mode_a_class_agnostic": {
48
+ "picker": {
49
+ "AR@100": 0.17270012670337706
50
+ },
51
+ "fcos": {
52
+ "AR@100": 0.16126021817890984
53
+ },
54
+ "delta_AR@100": 0.011439908524467218
55
+ },
56
+ "mode_b_text_swap_picker": null,
57
+ "mode_b_error": "skipped (--skip-text-swap)"
58
+ },
59
+ {
60
+ "domain": "all-elements",
61
+ "n_items": 142,
62
+ "n_domain_classes": 10,
63
+ "domain_class_names": [
64
+ "Button",
65
+ "Check box",
66
+ "Checked Radio button",
67
+ "Checked box",
68
+ "Dropdown box",
69
+ "Dropdown expand",
70
+ "Icon",
71
+ "Radio button",
72
+ "Scroll bar",
73
+ "Text box"
74
+ ],
75
+ "mode_a_class_agnostic": {
76
+ "picker": {
77
+ "AR@100": 0.07888318364128051
78
+ },
79
+ "fcos": {
80
+ "AR@100": 0.02245109092417947
81
+ },
82
+ "delta_AR@100": 0.05643209271710104
83
+ },
84
+ "mode_b_text_swap_picker": null,
85
+ "mode_b_error": "skipped (--skip-text-swap)"
86
+ },
87
+ {
88
+ "domain": "aquarium-combined",
89
+ "n_items": 127,
90
+ "n_domain_classes": 7,
91
+ "domain_class_names": [
92
+ "fish",
93
+ "jellyfish",
94
+ "penguin",
95
+ "puffin",
96
+ "shark",
97
+ "starfish",
98
+ "stingray"
99
+ ],
100
+ "mode_a_class_agnostic": {
101
+ "picker": {
102
+ "AR@100": 0.5815294733842996
103
+ },
104
+ "fcos": {
105
+ "AR@100": 0.4753463245165629
106
+ },
107
+ "delta_AR@100": 0.10618314886773672
108
+ },
109
+ "mode_b_text_swap_picker": null,
110
+ "mode_b_error": "skipped (--skip-text-swap)"
111
+ },
112
+ {
113
+ "domain": "defect-detection",
114
+ "n_items": 375,
115
+ "n_domain_classes": 4,
116
+ "domain_class_names": [
117
+ "defective fishplate",
118
+ "fastener",
119
+ "missing fastener",
120
+ "non defective fishplate"
121
+ ],
122
+ "mode_a_class_agnostic": {
123
+ "picker": {
124
+ "AR@100": 0.0029333333333333334
125
+ },
126
+ "fcos": {
127
+ "AR@100": 0.0010666666666666667
128
+ },
129
+ "delta_AR@100": 0.0018666666666666666
130
+ },
131
+ "mode_b_text_swap_picker": null,
132
+ "mode_b_error": "skipped (--skip-text-swap)"
133
+ },
134
+ {
135
+ "domain": "dentalai",
136
+ "n_items": 253,
137
+ "n_domain_classes": 4,
138
+ "domain_class_names": [
139
+ "Cavity",
140
+ "Fillings",
141
+ "Impacted Tooth",
142
+ "Implant"
143
+ ],
144
+ "mode_a_class_agnostic": {
145
+ "picker": {
146
+ "AR@100": 0.00916757190769369
147
+ },
148
+ "fcos": {
149
+ "AR@100": 0.0018475266961240958
150
+ },
151
+ "delta_AR@100": 0.007320045211569594
152
+ },
153
+ "mode_b_text_swap_picker": null,
154
+ "mode_b_error": "skipped (--skip-text-swap)"
155
+ },
156
+ {
157
+ "domain": "flir-camera-objects",
158
+ "n_items": 2513,
159
+ "n_domain_classes": 4,
160
+ "domain_class_names": [
161
+ "bicycle",
162
+ "car",
163
+ "dog",
164
+ "person"
165
+ ],
166
+ "mode_a_class_agnostic": {
167
+ "picker": {
168
+ "AR@100": 0.5425524384191743
169
+ },
170
+ "fcos": {
171
+ "AR@100": 0.5309809631263704
172
+ },
173
+ "delta_AR@100": 0.01157147529280389
174
+ },
175
+ "mode_b_text_swap_picker": null,
176
+ "mode_b_error": "skipped (--skip-text-swap)"
177
+ },
178
+ {
179
+ "domain": "gwhd2021",
180
+ "n_items": 1278,
181
+ "n_domain_classes": 1,
182
+ "domain_class_names": [
183
+ "whd"
184
+ ],
185
+ "mode_a_class_agnostic": {
186
+ "picker": {
187
+ "AR@100": 0.0147656631658906
188
+ },
189
+ "fcos": {
190
+ "AR@100": 0.017273207562968296
191
+ },
192
+ "delta_AR@100": -0.002507544397077696
193
+ },
194
+ "mode_b_text_swap_picker": null,
195
+ "mode_b_error": "skipped (--skip-text-swap)"
196
+ },
197
+ {
198
+ "domain": "lacrosse-object-detection",
199
+ "n_items": 100,
200
+ "n_domain_classes": 4,
201
+ "domain_class_names": [
202
+ "Goalie",
203
+ "Longpole",
204
+ "Referee",
205
+ "Shortstick"
206
+ ],
207
+ "mode_a_class_agnostic": {
208
+ "picker": {
209
+ "AR@100": 0.6658460140824318
210
+ },
211
+ "fcos": {
212
+ "AR@100": 0.5788187006562948
213
+ },
214
+ "delta_AR@100": 0.08702731342613701
215
+ },
216
+ "mode_b_text_swap_picker": null,
217
+ "mode_b_error": "skipped (--skip-text-swap)"
218
+ },
219
+ {
220
+ "domain": "new-defects-in-wood",
221
+ "n_items": 253,
222
+ "n_domain_classes": 5,
223
+ "domain_class_names": [
224
+ "Crack",
225
+ "Dead knot",
226
+ "Holes",
227
+ "Live knot",
228
+ "knot with crack"
229
+ ],
230
+ "mode_a_class_agnostic": {
231
+ "picker": {
232
+ "AR@100": 0.14613965756219366
233
+ },
234
+ "fcos": {
235
+ "AR@100": 0.056389986832622495
236
+ },
237
+ "delta_AR@100": 0.08974967072957116
238
+ },
239
+ "mode_b_text_swap_picker": null,
240
+ "mode_b_error": "skipped (--skip-text-swap)"
241
+ },
242
+ {
243
+ "domain": "orionproducts",
244
+ "n_items": 117,
245
+ "n_domain_classes": 8,
246
+ "domain_class_names": [
247
+ "Candy Boom",
248
+ "Chocopie Dark",
249
+ "Chocopie Nor",
250
+ "Marine Boy",
251
+ "OStar Red",
252
+ "OStar Yellow",
253
+ "Swing Maxx",
254
+ "Swing Nor"
255
+ ],
256
+ "mode_a_class_agnostic": {
257
+ "picker": {
258
+ "AR@100": 0.25509543324000816
259
+ },
260
+ "fcos": {
261
+ "AR@100": 0.17056578116921278
262
+ },
263
+ "delta_AR@100": 0.08452965207079538
264
+ },
265
+ "mode_b_text_swap_picker": null,
266
+ "mode_b_error": "skipped (--skip-text-swap)"
267
+ },
268
+ {
269
+ "domain": "paper-parts",
270
+ "n_items": 2407,
271
+ "n_domain_classes": 19,
272
+ "domain_class_names": [
273
+ "author",
274
+ "chapter",
275
+ "equation",
276
+ "equation number",
277
+ "figure",
278
+ "figure caption",
279
+ "footnote",
280
+ "list of content heading",
281
+ "list of content text",
282
+ "page number",
283
+ "paragraph",
284
+ "reference text",
285
+ "section",
286
+ "subsection",
287
+ "subsubsection",
288
+ "table",
289
+ "table caption",
290
+ "table of contents text",
291
+ "title"
292
+ ],
293
+ "mode_a_class_agnostic": {
294
+ "picker": {
295
+ "AR@100": 0.22175769264448789
296
+ },
297
+ "fcos": {
298
+ "AR@100": 0.19341745339612595
299
+ },
300
+ "delta_AR@100": 0.02834023924836193
301
+ },
302
+ "mode_b_text_swap_picker": null,
303
+ "mode_b_error": "skipped (--skip-text-swap)"
304
+ },
305
+ {
306
+ "domain": "recode-waste",
307
+ "n_items": 500,
308
+ "n_domain_classes": 6,
309
+ "domain_class_names": [
310
+ "aggregate",
311
+ "cardboard",
312
+ "hard plastic",
313
+ "metal",
314
+ "soft plastic",
315
+ "timber"
316
+ ],
317
+ "mode_a_class_agnostic": {
318
+ "picker": {
319
+ "AR@100": 0.11759555092621594
320
+ },
321
+ "fcos": {
322
+ "AR@100": 0.11386011148523538
323
+ },
324
+ "delta_AR@100": 0.0037354394409805647
325
+ },
326
+ "mode_b_text_swap_picker": null,
327
+ "mode_b_error": "skipped (--skip-text-swap)"
328
+ },
329
+ {
330
+ "domain": "soda-bottles",
331
+ "n_items": 449,
332
+ "n_domain_classes": 3,
333
+ "domain_class_names": [
334
+ "coca-cola",
335
+ "fanta",
336
+ "sprite"
337
+ ],
338
+ "mode_a_class_agnostic": {
339
+ "picker": {
340
+ "AR@100": 0.3584433421534061
341
+ },
342
+ "fcos": {
343
+ "AR@100": 0.29591111459858493
344
+ },
345
+ "delta_AR@100": 0.06253222755482118
346
+ },
347
+ "mode_b_text_swap_picker": null,
348
+ "mode_b_error": "skipped (--skip-text-swap)"
349
+ },
350
+ {
351
+ "domain": "the-dreidel-project",
352
+ "n_items": 108,
353
+ "n_domain_classes": 6,
354
+ "domain_class_names": [
355
+ "Dreidel",
356
+ "Gimel",
357
+ "Hay",
358
+ "Nun",
359
+ "Shin",
360
+ "Spinning Dreidel"
361
+ ],
362
+ "mode_a_class_agnostic": {
363
+ "picker": {
364
+ "AR@100": 0.6514730654725874
365
+ },
366
+ "fcos": {
367
+ "AR@100": 0.5772390587393332
368
+ },
369
+ "delta_AR@100": 0.0742340067332542
370
+ },
371
+ "mode_b_text_swap_picker": null,
372
+ "mode_b_error": "skipped (--skip-text-swap)"
373
+ },
374
+ {
375
+ "domain": "trail-camera",
376
+ "n_items": 261,
377
+ "n_domain_classes": 2,
378
+ "domain_class_names": [
379
+ "Deer",
380
+ "Hog"
381
+ ],
382
+ "mode_a_class_agnostic": {
383
+ "picker": {
384
+ "AR@100": 0.6955832277392519
385
+ },
386
+ "fcos": {
387
+ "AR@100": 0.6009897838607146
388
+ },
389
+ "delta_AR@100": 0.0945934438785373
390
+ },
391
+ "mode_b_text_swap_picker": null,
392
+ "mode_b_error": "skipped (--skip-text-swap)"
393
+ },
394
+ {
395
+ "domain": "water-meter",
396
+ "n_items": 136,
397
+ "n_domain_classes": 10,
398
+ "domain_class_names": [
399
+ "0",
400
+ "1",
401
+ "2",
402
+ "3",
403
+ "4",
404
+ "5",
405
+ "6",
406
+ "7",
407
+ "8",
408
+ "9"
409
+ ],
410
+ "mode_a_class_agnostic": {
411
+ "picker": {
412
+ "AR@100": 0.0003676470588235294
413
+ },
414
+ "fcos": {
415
+ "AR@100": 0.00665441194877905
416
+ },
417
+ "delta_AR@100": -0.006286764889955521
418
+ },
419
+ "mode_b_text_swap_picker": null,
420
+ "mode_b_error": "skipped (--skip-text-swap)"
421
+ },
422
+ {
423
+ "domain": "wb-prova",
424
+ "n_items": 289,
425
+ "n_domain_classes": 3,
426
+ "domain_class_names": [
427
+ "Adult",
428
+ "Juvenile",
429
+ "Piglet"
430
+ ],
431
+ "mode_a_class_agnostic": {
432
+ "picker": {
433
+ "AR@100": 0.8620688187519159
434
+ },
435
+ "fcos": {
436
+ "AR@100": 0.8358979204224879
437
+ },
438
+ "delta_AR@100": 0.026170898329427983
439
+ },
440
+ "mode_b_text_swap_picker": null,
441
+ "mode_b_error": "skipped (--skip-text-swap)"
442
+ },
443
+ {
444
+ "domain": "wildfire-smoke",
445
+ "n_items": 147,
446
+ "n_domain_classes": 1,
447
+ "domain_class_names": [
448
+ "smoke"
449
+ ],
450
+ "mode_a_class_agnostic": {
451
+ "picker": {
452
+ "AR@100": 0.005442176870748299
453
+ },
454
+ "fcos": {
455
+ "AR@100": 0.0034013605442176865
456
+ },
457
+ "delta_AR@100": 0.0020408163265306124
458
+ },
459
+ "mode_b_text_swap_picker": null,
460
+ "mode_b_error": "skipped (--skip-text-swap)"
461
+ },
462
+ {
463
+ "domain": "x-ray-id",
464
+ "n_items": 767,
465
+ "n_domain_classes": 6,
466
+ "domain_class_names": [
467
+ "DIP",
468
+ "MCP",
469
+ "PIP",
470
+ "Radius",
471
+ "Ulna",
472
+ "Wrist"
473
+ ],
474
+ "mode_a_class_agnostic": {
475
+ "picker": {
476
+ "AR@100": 0.0
477
+ },
478
+ "fcos": {
479
+ "AR@100": 1.5338599642998546e-05
480
+ },
481
+ "delta_AR@100": -1.5338599642998546e-05
482
+ },
483
+ "mode_b_text_swap_picker": null,
484
+ "mode_b_error": "skipped (--skip-text-swap)"
485
+ }
486
+ ],
487
+ "aggregate": {
488
+ "n_domains": 20,
489
+ "mode_a_picker_AR100_mean": 0.2889119438235859,
490
+ "mode_a_fcos_AR100_mean": 0.25092240729454635,
491
+ "mode_a_delta_mean": 0.037989536529039566,
492
+ "mode_a_picker_wins": 17,
493
+ "mode_a_fcos_wins": 3,
494
+ "mode_a_ties": 0,
495
+ "mode_b_n_domains_valid": 0,
496
+ "mode_b_picker_text_swap_mAP_mean": 0.0,
497
+ "mode_b_picker_text_swap_mAP_median": 0.0
498
+ }
499
+ }