update repository

Browse files

Files changed (8) hide show

.gitattributes +160 -0
LICENSE +127 -0
README.md +320 -0
argus.py +2257 -0
config.json +2029 -0
model.bf16_backbone.safetensors +3 -0
model.safetensors +3 -0
rf100vl_zero_shot_cross_domain_eval.json +499 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,160 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000002.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000008.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000031.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000058.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000083.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000089.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000191.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000400.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000436.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000452.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000496.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000557.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000575.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000591.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000600.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000665.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000696.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000773.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000781.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000793.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000868.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000880.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000889.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000939.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000957.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/000998.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001057.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001078.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001118.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001191.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001289.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001348.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001394.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001430.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001586.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001587.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001599.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001624.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001631.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001634.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001685.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001741.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001763.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001851.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001934.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001949.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001951.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/001983.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002022.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002070.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002086.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002186.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002284.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002334.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002450.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002468.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002489.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002497.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002514.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002538.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002553.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002586.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002592.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002598.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002640.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002660.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002671.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002799.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002905.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002939.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/002953.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003084.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003132.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003148.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003254.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003344.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003391.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003420.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003502.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003541.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003614.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003665.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003713.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003754.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003805.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003871.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003880.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003911.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/003978.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004039.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004066.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004082.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004096.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004131.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004170.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004172.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004263.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004304.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004315.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004329.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004371.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004431.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004510.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004514.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004517.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004525.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004535.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004546.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004548.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004557.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004585.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004590.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004610.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004627.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004651.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004656.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004702.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004743.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004755.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004775.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004781.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004831.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004852.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004939.jpg filter=lfs diff=lfs merge=lfs -text
+quickstart/data/004978.jpg filter=lfs diff=lfs merge=lfs -text

LICENSE ADDED Viewed

	@@ -0,0 +1,127 @@

+# FAIR Noncommercial Research License
+*v1 Last Updated: August 18, 2025*
+**"Acceptable Use Policy"** means the FAIR Acceptable Use Policy, applicable to Research Materials, that is incorporated into this Agreement.
+**"Agreement"** means the terms and conditions for use, reproduction, distribution and modification of the Research Materials set forth herein.
+**"Documentation"** means the specifications, manuals and documentation accompanying
+Research Materials distributed by Meta.
+**"Licensee"** or **"you"** means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity's behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
+**"Meta"** or **"we"** means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
+**"Noncommercial Research Uses"** means noncommercial research use cases related to research, development, education, processing, or analysis and in each case, is not primarily intended for commercial advantage or monetary compensation to you or others.
+**"Research Materials"** means, collectively, Documentation and the models, software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code, demonstration materials and other elements of the foregoing distributed by Meta and made available under this Agreement.
+By clicking "I Accept" below or by using or distributing any portion or element of the Research Materials, you agree to be bound by this Agreement.
+## 1. License Rights and Redistribution.
+a. <ins>Grant of Rights</ins>. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta's intellectual property or other rights owned by Meta embodied in the Research Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Research Materials.
+b. <ins>Redistribution and Use</ins>.
+i. You will not use the Research Materials or any outputs or results of the Research Materials in connection with any commercial uses or for any uses other than Noncommercial Research Uses;
+ii. Distribution of Research Materials, and any derivative works thereof, are subject to the terms of this Agreement. If you distribute or make the Research Materials, or any derivative works thereof, available to a third party, you may only do so under the terms of this Agreement. You shall also provide a copy of this Agreement to such third party.
+iii.  If you submit for publication the results of research you perform on, using, or otherwise in connection with Research Materials, you must acknowledge the use of Research Materials in your publication.
+iv. Your use of the Research Materials must comply with applicable laws and regulations (including Trade Control Laws) and adhere to the FAIR Acceptable Use Policy, which is hereby incorporated by reference into this Agreement.
+## 2. User Support.
+Your Noncommercial Research Use of the Research Materials is done at your own discretion; Meta does not process any information nor provide any service in relation to such use.  Meta is under no obligation to provide any support services for the Research Materials. Any support provided is "as is", "with all faults", and without warranty of any kind.
+## 3. Disclaimer of Warranty.
+UNLESS REQUIRED BY APPLICABLE LAW, THE RESEARCH MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE RESEARCH MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE RESEARCH MATERIALS AND ANY OUTPUT AND RESULTS.
+## 4. Limitation of Liability.
+IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT OR INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+## 5. Intellectual Property.
+a. Subject to Meta's ownership of Research Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Research Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
+b. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Research Materials, outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Research Materials.
+## 6. Term and Termination.
+The term of this Agreement will commence upon your acceptance of this Agreement or access to the Research Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Research Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.
+## 7. Governing Law and Jurisdiction.
+This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
+## 8. Modifications and Amendments.
+Meta may modify this Agreement from time to time; provided that they are similar in spirit to the current version of the Agreement, but may differ in detail to address new problems or concerns. All such changes will be effective immediately. Your continued use of the Research Materials after any modification to this Agreement constitutes your agreement to such modification. Except as provided in this Agreement, no modification or addition to any provision of this Agreement will be binding unless it is in writing and signed by an authorized representative of both you and Meta.
+## FAIR Acceptable Use Policy
+The Fundamental AI Research (FAIR) team at Meta seeks to further understanding of new and existing research domains with the mission of advancing the state-of-the-art in artificial intelligence through open research for the benefit of all.
+As part of this mission, Meta makes certain research materials available for noncommercial research use. Meta is committed to promoting the safe and responsible use of such research materials.
+### Prohibited Uses
+You agree you will not use, or allow others to use, Research Materials to:
+ Violate the law or others' rights, including to:
+Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as:
+Violence or terrorism
+Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material
+Human trafficking, exploitation, and sexual violence
+The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials.
+Sexual solicitation
+Any other criminal activity
+Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals
+Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services
+Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices
+Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws
+Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any technology using FAIR research materials
+Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system
+2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of research artifacts related to the following:
+Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are subject to the International Traffic Arms Regulations (ITAR) maintained by the United States Department of State
+Guns and illegal weapons (including weapon development)
+Illegal drugs and regulated/controlled substances
+Operation of critical infrastructure, transportation technologies, or heavy machinery
+Self-harm or harm to others, including suicide, cutting, and eating disorders
+Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual
+3. Intentionally deceive or mislead others, including use of FAIR Research Materials related to the following:
+ Generating, promoting, or furthering fraud or the creation or promotion of disinformation
+ Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content
+Generating, promoting, or further distributing spam
+ Impersonating another individual without consent, authorization, or legal right
+Representing that outputs of FAIR research materials or outputs from technology using FAIR research materials are human-generated
+Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement
+4. Fail to appropriately disclose to end users any known dangers of your Research Materials.
+Please report any violation of this Policy or other problems that could lead to a violation of this Policy by submitting a report [here](https://docs.google.com/forms/d/e/1FAIpQLSeb11cryAopJ7LNrC4nxEUXrHY26hfkXQMf_uH-oFgA3WlYZQ/viewform).

README.md ADDED Viewed

	@@ -0,0 +1,320 @@

+---
+license: other
+license_name: fair-research-license
+license_link: LICENSE
+base_model: facebook/EUPE-ViT-B
+tags:
+  - multi-task-perception
+  - computer-vision
+  - image-classification
+  - semantic-segmentation
+  - depth-estimation
+  - object-detection
+  - keypoint-correspondence
+  - vision-transformer
+library_name: pytorch
+datasets:
+  - imagenet-1k
+  - scene_parse_150
+  - sayakpaul/nyu_depth_v2
+  - detection-datasets/coco
+metrics:
+  - accuracy
+  - mean_iou
+  - mAP
+---
+# Argus
+Argus is a multi-task perception system built on a single frozen vision backbone. One forward pass through the encoder produces classification labels, semantic segmentation masks, metric depth maps, object detections, and dense keypoint correspondences. Roughly 103M parameters total, with the 86M backbone frozen and about 17.3M learnable across five task heads. Named after Argus Panoptes, the many-eyed giant of Greek mythology tasked with watching over everything at once.
+The backbone is [EUPE-ViT-B](https://huggingface.co/facebook/EUPE-ViT-B), introduced in *Efficient Universal Perception Encoder* (Zhu et al., Meta FAIR, [arXiv:2603.22387](https://arxiv.org/abs/2603.22387), March 2026). EUPE distills a small vision encoder from a collection of larger specialist teachers, producing features that transfer well to image understanding, dense prediction, and vision-language tasks simultaneously. Argus leaves those weights frozen and attaches five lightweight heads.
+## Architecture
+```
+Image → EUPE-ViT-B (frozen, 86M) → shared features
+  ├── Classification   trained linear softmax, 1000 ImageNet classes
+  ├── Segmentation     BN + 1×1 Conv, 150 ADE20K classes
+  ├── Depth            DPT multi-scale decoder, metric depth (meters), NYU Depth V2
+  ├── Detection        split-tower on a multi-scale feature decomposition, 80 COCO classes
+  └── Correspondence   training-free dense feature matching
+```
+| Head | Params | Description |
+|---|---|---|
+| Classification | 769K | `Linear(768, 1000)` softmax on the L2-normalized CLS token |
+| Segmentation | 117K | `BatchNorm2d(768) → Conv2d(768, 150, 1×1)` at stride 16, bilinear-upsampled to input resolution |
+| Depth | 13.45M | DPT fusing backbone blocks [2, 5, 8, 11], 256 depth bins over 0.001 to 10 m |
+| Detection | 2.98M | 5 prediction levels at strides [8, 16, 32, 64, 128], cosine similarity against CLIP ViT-L/14 text embeddings |
+| Correspondence | 0 | cosine-max on backbone spatial features |
+## Benchmarks
+### EUPE paper reproduction
+All four reported benchmarks were reproduced as part of building Argus.
+| Task | Dataset | Metric | Paper | Argus | Delta |
+|---|---|---|---|---|---|
+| Classification | ImageNet-1k | kNN k=10 top-1 | 84.1 | 84.07 | −0.03 |
+| Segmentation | ADE20K | mean IoU | 52.4 | 52.72 | +0.32 |
+| Depth | NYU Depth V2 | RMSE (lower is better) | 0.391 | 0.3914 | +0.0004 |
+| Correspondence | SPair-71k | PCK@0.1 | 51.3 | 54.35 | +3.05 |
+### Shipped task metrics
+| Task | Dataset | Metric | Value |
+|---|---|---|---|
+| Classification | ImageNet-1k val | top-1 / top-5 | 85.53 / 97.69 |
+| Segmentation | ADE20K val | mIoU | 52.72 |
+| Depth | NYU Depth V2 test | RMSE / abs_rel / a1 | 0.480 / 0.219 / 0.872 |
+| Detection | COCO val2017 | mAP @[.5:.95] | 42.64 (42.71 soft NMS) |
+| Correspondence | SPair-71k | PCK@0.1 | 54.35 |
+The shipped classifier is a trained linear softmax layer (85.53% top-1) that superseded the kNN protocol used during paper reproduction. The shipped depth head is a DPT decoder that improves RMSE by 8% and abs_rel by 28% over a linear probe on the same backbone (0.480 vs 0.520 RMSE).
+### Detection detail (COCO val2017)
+| Metric | Value |
+|---|---|
+| mAP@[0.5:0.95] | **42.64** |
+| mAP@0.50 | 65.70 |
+| mAP@0.75 | 45.10 |
+| mAP (small / medium / large) | 22.31 / 48.33 / 62.90 |
+At 2.98M learnable parameters, the detection head surpasses the 16.14M FCOS simple-feature-pyramid baseline (41.0 mAP) by 1.64 mAP, using 18.4% of its head parameter budget. Small-object mAP is 22.3 against FCOS's 19.4 (+2.9). The backbone was never exposed to detection data; these are the same frozen features used for every other task.
+Evaluation protocol: per-class hard NMS (IoU 0.5), score threshold 0.05, top-100 detections per image, pycocotools on COCO val2017.
+The standalone checkpoint and related detection-head work live in [phanerozoic/detection-heads](https://huggingface.co/phanerozoic/detection-heads).
+### Cross-Dataset Detection Transfer
+To test whether the detection head's features generalize beyond COCO, the shipped 2.98M detection head (trained on COCO 2017 at 768px) and the 16.14M FCOS baseline (trained on COCO 2017 at 640px) were each evaluated zero-shot against the 20 RF100-VL validation domains. Both heads saw only COCO during training; RF100-VL was never exposed to either. Evaluation is class-agnostic AR@100 (all detections relabeled to a single "object" class, all ground-truth boxes relabeled likewise) so that localization transfer can be measured even on domains whose label space does not overlap COCO-80.
+|                          domain                         | FCOS (16.1M) | Ours (3.0M) |   Δ   |
+|---------------------------------------------------------|-------------:|------------:|------:|
+| actions                                                 |         37.5 |        39.6 |  +2.1 |
+| aerial-airport                                          |         16.1 |        17.3 |  +1.1 |
+| all-elements                                            |          2.3 |         7.9 |  +5.6 |
+| aquarium-combined                                       |         47.5 |        58.2 | +10.6 |
+| defect-detection                                        |          0.1 |         0.3 |  +0.2 |
+| dentalai                                                |          0.2 |         0.9 |  +0.7 |
+| flir-camera-objects                                     |         53.1 |        54.3 |  +1.2 |
+| gwhd2021                                                |          1.7 |         1.5 |  -0.3 |
+| lacrosse-object-detection                               |         57.9 |        66.6 |  +8.7 |
+| new-defects-in-wood                                     |          5.6 |        14.6 |  +9.0 |
+| orionproducts                                           |         17.1 |        25.5 |  +8.5 |
+| paper-parts                                             |         19.3 |        22.2 |  +2.8 |
+| recode-waste                                            |         11.4 |        11.8 |  +0.4 |
+| soda-bottles                                            |         29.6 |        35.8 |  +6.3 |
+| the-dreidel-project                                     |         57.7 |        65.2 |  +7.4 |
+| trail-camera                                            |         60.1 |        69.6 |  +9.5 |
+| water-meter                                             |          0.7 |         0.0 |  -0.6 |
+| wb-prova                                                |         83.6 |        86.2 |  +2.6 |
+| wildfire-smoke                                          |          0.3 |         0.5 |  +0.2 |
+| x-ray-id                                                |          0.0 |         0.0 |   0.0 |
+| **RF100-VL AR@100 mean**                                |     **25.1** |    **28.9** | **+3.8** |
+| **Domain wins**                                         |        **3** |      **17** |       |
+The detection head wins 17 of 20 domains, loses 3, with mean AR@100 +3.8 over the 5× larger FCOS baseline. The largest gaps are on domains far from COCO's distribution: aquarium-combined (+10.6), trail-camera (+9.5), new-defects-in-wood (+9.0), lacrosse-object-detection (+8.7), orionproducts (+8.5), the-dreidel-project (+7.4), soda-bottles (+6.3), all-elements (+5.6). The three losses are small (≤0.6 AR; the x-ray-id gap rounds to 0.0 at the table's one-decimal precision) on domains with very low absolute AR for both heads (gwhd2021 wheat-head crops, water-meter digit reads, x-ray-id anatomical landmarks). The interpretation is that the backbone's multi-teacher distilled features produce representations general enough that a frozen head one-fifth the FCOS size transfers across wildly different visual domains at the same level or better.
+### Cross-Dataset Segmentation Transfer
+A separate BN+1×1 linear probe was trained on Cityscapes with the same training recipe as the ADE20K head, on the frozen backbone. The backbone was never exposed to driving scenes during EUPE distillation or Argus head training.
+| Dataset | Classes | Train images | mIoU |
+|---|---|---|---|
+| ADE20K (shipped head) | 150 | 20,210 | 52.72 |
+| Cityscapes (transfer probe) | 19 | 2,975 | 63.76 |
+The Cityscapes probe scores road 96.4, car 87.9, sky 88.8, building 86.7, vegetation 85.6. The weaker categories are thin vertical structures (pole 17.8, traffic light 36.4, traffic sign 48.3), which is an inherent resolution limitation of the stride-16 patch grid rather than a deficiency in the learned representation.
+## Comparison with standard baselines
+As a sanity check, Argus was compared against several well-known models on the same 200-image COCO subset. The classification comparison uses a keyword cross-reference between each model's top-k ImageNet predictions and the COCO ground-truth detection labels on those images, which provides a consistent yardstick across differently-trained models despite the label-space mismatch. **These hit rates measure agreement with COCO detection labels via keyword matching on the 200-image subset; they are not raw ImageNet accuracy.** For reference, all three classifiers exceed 80% top-1 on the full ImageNet validation set.
+**Classification** (hit rate against COCO detection labels, 200 images):
+| Model              | Parameters | Top-1 hit | Top-5 hit | Latency | Peak VRAM |
+|--------------------|------------|-----------|-----------|---------|-----------|
+| Argus (EUPE-ViT-B) | 86 M       | 42.2%     | 66.8%     | 13.1 ms | 0.34 GB   |
+| ConvNeXt-Base      | 89 M       | 40.2%     | 71.4%     | 10.4 ms | 0.35 GB   |
+| ResNet50           | 26 M       | 36.2%     | 61.8%     | 8.4 ms  | 0.12 GB   |
+**Segmentation**:
+| Model                      | Parameters | Classes | Latency | Peak VRAM |
+|----------------------------|------------|---------|---------|-----------|
+| Argus (EUPE + linear head) | 86 M       | 150     | 11.8 ms | 0.41 GB   |
+| DeepLabV3-ResNet50         | 42 M       | 21      | 15.9 ms | 0.33 GB   |
+**Depth**:
+| Model                      | Parameters | Latency | Peak VRAM |
+|----------------------------|------------|---------|-----------|
+| Argus (EUPE + linear head) | 86 M       | 13.3 ms | 0.35 GB   |
+| Depth-Anything-V2-Base     | 98 M       | 18.8 ms | 0.68 GB   |
+Argus produces the highest top-1 hit rate of the three image classifiers, with ConvNeXt-Base edging it slightly on top-5. The Argus classification row above was measured with the kNN method during the original head-to-head comparison; the current shipped classifier (trained linear softmax) would widen the top-5 margin. Argus is faster than DeepLabV3 while predicting a much richer label space, and it is faster than Depth-Anything-V2 while using roughly half the VRAM. Although these baselines and Argus were trained for different objectives on different datasets, the comparison is useful for understanding what the model delivers in practice.
+### Multi-Task Throughput
+The per-task comparisons above measure each head against its single-task counterpart in isolation. A separate question is what happens when a user needs all of the tasks at once, which is the typical situation in dataset annotation, model evaluation, and any pipeline where images pass through multiple analysis stages in sequence. The alternative to Argus in that situation is to load and run four separate single-task models of comparable quality, each carrying its own backbone, its own preprocessing, and its own forward pass. The total cost is the sum of the four individual inference times, plus the memory overhead of holding four independent models on the device simultaneously.
+The models chosen for this comparison were selected to match the quality tier of the EUPE-ViT-B backbone rather than to minimize size or maximize speed. ConvNeXt-Base (88.6M parameters) is a widely-used ImageNet-1k classifier at the same parameter scale as EUPE-ViT-B. SegFormer-B3 (47.3M) is a transformer-based ADE20K semantic segmenter that is the standard mid-range alternative to a linear probe on a frozen backbone. Depth-Anything-V2-Base (97.5M) is the current standard for single-image monocular depth estimation at base scale. YOLO26l (26.3M) is the large variant of the January 2026 YOLO release from Ultralytics, representing the state of the art in efficient real-time detection. All measurements were taken on an NVIDIA RTX 6000 Ada across the same nine example images, with five timed runs after a three-image warmup pass to eliminate cold-start effects.
+| Pipeline | Parameters | Latency per image | Tasks |
+|----------|-----------|-------------------|-------|
+| Argus unified | 103 M | 56 ms | 5 (classify, segment, depth, detect, correspond) |
+| Four separate models | 260 M | 68 ms | 4 (classify, segment, depth, detect) |
+The per-model breakdown for the separate pipeline is ConvNeXt-Base at 6 ms, SegFormer-B3 at 19 ms, Depth-Anything-V2-Base at 31 ms, and YOLO26l at 12 ms, summing to 68 ms when the tasks are run sequentially on the same image. Argus completes five tasks (the same four plus keypoint correspondence, which the separate pipeline does not attempt) in 56 ms from a single model load. The total parameter count for the separate pipeline is 260M across four independent weight sets, while Argus carries 103M in a single file.
+The throughput advantage comes from the shared backbone. Each of the four separate models pays the cost of encoding the image through its own network before producing task-specific output. Argus encodes the image once through EUPE-ViT-B and then routes the resulting features to five lightweight heads, each of which adds only a few milliseconds on top of the shared representation. The backbone forward pass is the dominant cost in both pipelines, and running it once rather than four times is where the 1.2x throughput improvement and 2.5x parameter reduction originate. The practical consequence for deployment is that Argus requires a single model download, a single checkpoint load, and a single Python import, where the equivalent separate-model pipeline requires four downloads totaling over a gigabyte, four independent weight sets held concurrently, and four separate dependency trees to manage.
+## Usage
+```python
+from PIL import Image
+from transformers import AutoModel
+model = AutoModel.from_pretrained("phanerozoic/argus", trust_remote_code=True)
+image = Image.open("your_image.jpg").convert("RGB")
+top5  = model.classify(image, top_k=5)
+seg   = model.segment(image)              # [H, W] class indices
+depth = model.depth(image)                # [H, W] metric depth in meters
+dets  = model.detect(image, score_thresh=0.3)
+# dets: list of {"box": [x1, y1, x2, y2], "score", "label", "class_name"}
+# Three tasks at once (shared backbone forward inside perceive)
+result = model.perceive(image)
+# result["classification"], ["segmentation"], ["depth"], ["timings_ms"]
+# Keypoint correspondence between two images
+target = Image.open("other_image.jpg").convert("RGB")
+predicted = model.correspond(image, target, [[100, 100], [200, 200]])
+```
+Every single-image method also accepts a list of PIL images and returns a list of per-image results in the same shape a single call would produce.
+### Confidence outputs
+```python
+seg_map, seg_conf   = model.segment(image, return_confidence=True)
+# seg_conf is per-pixel max softmax probability in [0, 1]
+depth_map, depth_std = model.depth(image, return_confidence=True)
+# depth_std is per-pixel standard deviation of the 256-bin distribution
+result = model.perceive(image, return_confidence=True)
+# result["segmentation_confidence"], result["depth_uncertainty"]
+```
+Classification always carries a `margin` field (top-1 minus top-2 score) on the first entry.
+### ONNX export
+```python
+paths = model.export_onnx("/path/to/out_dir", backbone_resolution=640, verify=True)
+# backbone, classifier, seg_head, depth_head, detection_head (five graphs)
+```
+The segmentation graph folds bilinear upsample to input resolution inside the graph, so consumers argmax directly. The classifier graph is self-contained (softmax weights captured as buffers). The depth head accepts four intermediate ViT-block activations as separate positional tensor inputs. The detection head returns pre-NMS per-location boxes and scores by default, or with `include_nms=True` bakes ONNX `NonMaxSuppression` (opset ≥ 10) into the detection graph for single-shot TensorRT or mobile inference. Correspondence has no learned parameters and needs no graph.
+Tolerance for `verify=True` can be a float or a dict keyed by verification output name. When a float is passed, detection box coordinates get a resolution-scaled tolerance because `exp()` in the regression path amplifies FP kernel-dispatch differences to pixel scale.
+### INT8 quantization
+```python
+model = AutoModel.from_pretrained("phanerozoic/argus", trust_remote_code=True)
+model = model.cuda().eval().quantize_int8()  # requires: pip install torchao
+```
+Weight-only INT8 quantization via torchao. Linear weights go to INT8; activations stay in BF16. Classification agreement with FP32 is 100%, depth drift averages 0.013 m. Reduces weight VRAM substantially. Latency behaviour depends on whether the target GPU has an INT8 tensor-core path torchao can dispatch to.
+### Precision variants
+Two safetensors files are provided, with identical inference behaviour but different on-disk precision.
+| File | Load |
+|---|---|
+| `model.safetensors` | `AutoModel.from_pretrained("phanerozoic/argus", trust_remote_code=True)` |
+| `model.bf16_backbone.safetensors` | add `variant="bf16_backbone"` |
+Both load into the same FP32 model in memory; PyTorch upcasts the stored bfloat16 weights at construction. The smaller variant saves download bandwidth only.
+## Training
+The backbone is frozen for every task. Only the task heads are trained; the kNN class prototypes used during paper reproduction were extracted (not trained at all).
+| Component | Source | Method |
+|---|---|---|
+| Segmentation | ADE20K (20,210 train) | Linear probe, CE loss, AdamW lr 1e-3, 512×512, 40,000 iterations |
+| Depth | NYU Depth V2 (24,231 train) | DPT decoder, SILog loss, AdamW lr 1e-4, 416×416, 38,400 iterations |
+| Linear softmax classifier | ImageNet-1k (1.28M train) | Cached CLS features, SGD momentum 0.9, cosine LR, 100 epochs |
+| Detection | COCO 2017 (117,266 train) | Split-tower on a multi-scale decomposition of frozen features, ATSS, focal + GIoU + BCE, AdamW lr 5e-4, 768×768, 16 ep + 3 ep partial calibration |
+| Correspondence | none | training-free cosine similarity |
+### Backbone simplification
+The upstream EUPE-ViT-B release ships a `LinearKMaskedBias` wrapper around each block's QKV projection. In the released weights both the `bias_mask` and the `bias` are filled with zeros across all twelve blocks, so the masked bias is identically zero at every forward pass. The Argus backbone drops the 24 redundant tensors entirely (12 × `qkv.bias` + 12 × `qkv.bias_mask`, 55,296 values total), and the attention blocks are constructed with `qkv_bias=False, mask_k_bias=False`. FP32 forward is bitwise-equivalent for classification, segmentation, detection, and correspondence. The DPT depth decoder shows sub-centimeter drift under BF16 autocast; the drift is an order of magnitude smaller than the head's own 39-centimeter NYU Depth V2 RMSE and causes no visible change in depth maps. To load the upstream EUPE-ViT-B release directly into this backbone class, pass `strict=False` to `load_state_dict` so the extra keys in the upstream checkpoint are silently ignored.
+### Head details
+**Segmentation.** `BatchNorm2d(768) → Conv2d(768, 150, 1×1)`, 116,886 parameters. Trained at 512×512 with cross-entropy loss, AdamW (lr 1e-3, weight decay 1e-3), WarmupOneCycleLR with 1500-step warmup, batch 16.
+**Depth (DPT).** Hooks into backbone blocks [2, 5, 8, 11] via PyTorch forward hooks, capturing intermediate representations without modifying the backbone. A reassemble stage projects each block's output from 768 to 256 channels via LayerNorm + Linear, reshapes to spatial grids, and rescales to strides [4, 8, 16, 32]. A bottom-up fusion path combines the four scales through residual conv blocks with skip connections. A final conv head produces 256 depth-bin logits; metric depth is the bin-weighted sum. 13,450,000 parameters. Trained at 416×416 with SILog loss, AdamW (lr 1e-4, weight decay 1e-3), cosine schedule with 3% warmup, batch 16, 38,400 iterations.
+**Linear softmax classifier.** A single `Linear(768, 1000)` layer with bias applied to the L2-normalized CLS token, 769,000 parameters. Trained as a two-pass job: first the frozen backbone runs over ImageNet-1k train to cache a per-image CLS feature tensor (1,281,167 × 768), then the linear layer trains on the cached features alone with SGD (momentum 0.9, weight decay 0), batch 4096, cosine schedule, 100 epochs, no augmentation. A small LR sweep over {0.5, 1.0, 3.0, 10.0, 30.0} selected lr=30.0: L2-normalized features plus zero-initialized weights require an unusually large learning rate to grow the weight scale to the point where the softmax distribution sharpens. The best run reached 85.53% top-1 and 97.69% top-5 on ImageNet-1k val.
+**Detection (split-tower on a cofiber decomposition of frozen features).** Anchor-free. The multi-scale decomposition is applied per-channel to the 768-D feature map: 2×2 average pool reduce, bilinear upsample expand, band_k = x_k − U(D(x_k)) with x_{k+1} = D(x_k). Zero parameters, replacing an 11M-parameter FPN. Five prediction levels at strides [8, 16, 32, 64, 128]: four bands at 16, 32, 64, 128, plus a stride-8 level from a single transposed convolution on the stride-16 band. The Rocq/HoTT formalization in phanerozoic/cofiber-detection proves that this is a split short exact sequence in any semi-additive category with an adjoint retraction pair (U, D), with each band equal to ker(D_k). Burt & Adelson's 1983 Laplacian pyramid is the scalar-image instance with Gaussian reduce. Separate classification and regression towers of depth nine (five 3×3 ConvGN blocks followed by four depthwise residual blocks at 160 hidden channels) process each level with weights shared across levels. Top-down lateral connections pass information from coarser to finer bands before the towers run. Classification is cosine similarity between a `Linear(160, 768)` projection and CLIP ViT-L/14 multi-prompt text embeddings of the 80 COCO class names, with a learned scalar temperature and per-class bias. Regression uses exponentiated LTRB distances with a learned per-level scale. Centerness is a single 1×1 convolution. 2,975,067 parameters.
+Trained at 768×768 with letterbox padding, ATSS target assignment (Zhang et al. 2020), horizontal-flip augmentation, focal loss (α=0.25, γ=2.0) for classification, GIoU for boxes, BCE for centerness, AdamW (lr 5e-4, weight decay 1e-4), cosine schedule with 3% warmup, batch 16, 16 epochs. Step 104,000 of 117,264 was selected by late-training checkpoint sweep as the base. A 3-epoch partial fine-tune at lr 1e-4 then updates only `cls_project`, `cls_bias`, and `logit_scale` (the classification calibration layers), leaving the towers and the decomposition path frozen. The partial fine-tune adds +0.15 aggregate mAP and +1.1 small-object mAP. The shipped weights are the final state of that fine-tune. The standalone detection-head checkpoint is mirrored in the sibling detection-heads repo at [`heads/cofiber_threshold/split_tower_5scale_160h_5std_4dw_ema_l14_16ep_768_cls_calib/checkpoint_final.pth`](https://huggingface.co/phanerozoic/detection-heads/blob/main/heads/cofiber_threshold/split_tower_5scale_160h_5std_4dw_ema_l14_16ep_768_cls_calib/checkpoint_final.pth) with its eval JSON alongside.
+**Correspondence.** No learned parameters. At inference, dense patch features are extracted from both images, upsampled to 512×512 pixel resolution, and matched by cosine similarity per source keypoint.
+### Compute
+| Task                             | Iterations                                                                                          | Notes                                                                                                       |
+|----------------------------------|-----------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------|
+| Segmentation (ADE20K)            | 40,000                                                                                              | linear probe, batch 16, 512px, CE loss, frozen backbone                                                     |
+| Linear classifier (ImageNet-1k)  | 100 epochs × 313 steps                                                                              | SGD momentum 0.9, batch 4096, cosine schedule on cached CLS features; extraction is a single full-train pass through the frozen backbone |
+| DPT depth decoder (NYU Depth V2) | 38,400 iterations                                                                                   | batch 16, 416px, SILog loss, frozen backbone                                                                |
+| Detection (COCO 2017)            | 16 epochs × 7,329 batches at 768px + 3-epoch partial fine-tune of classification calibration layers | bf16 mixed-precision forward + fp32 master params + fp32 AdamW moments, CUDA graph capture, frozen backbone |
+| Correspondence (SPair-71k)       | training-free                                                                                       |                                                                                                             |
+### Why minimal heads
+The segmentation and classification heads follow the EUPE paper's evaluation principle: a minimal decoder isolates the backbone's contribution from the head's capacity. A Mask2Former-style segmentation head would produce higher mIoU, but those numbers would reflect the decoder as much as the features. The depth and detection heads are heavier because their tasks require multi-scale reasoning. The decomposition costs no trained parameters, so the detection head budget stays small (2.98M) while covering five pyramid levels from stride 8 to stride 128.
+## Notes and limitations
+- The segmentation head was trained on ADE20K's 150-class indoor-and-urban label space.
+- The depth head was trained on NYU Depth V2 (indoor). Outdoor metric depth should be treated as approximate.
+- The detection head was trained on COCO 2017's 80-class label space at 768-pixel input. Small-object mAP (22.3) is the weakest axis because the stride-8 P3 level can only resolve objects roughly 10 pixels and larger at that resolution.
+- Correspondence has no confidence signal; it returns a target pixel for every source keypoint regardless of match ambiguity.
+## License
+The EUPE-ViT-B backbone weights inside this checkpoint were released by Meta FAIR under the [FAIR Research License](https://huggingface.co/facebook/EUPE-ViT-B/blob/main/LICENSE), which restricts use to non-commercial research and education. The task heads and class prototypes in this checkpoint were trained independently by the author of this repository and would on their own be releasable under a permissive license. However, because they are inseparably bundled with the backbone weights in a single file, the unified checkpoint inherits the more restrictive license of its most restricted component. In practical terms, both `model.safetensors` and `model.bf16_backbone.safetensors` should be treated as released under the FAIR Research License. See `LICENSE` for the full text.
+## Citation
+```bibtex
+@misc{zhu2026eupe,
+  title={Efficient Universal Perception Encoder},
+  author={Zhu, Chenchen and Suri, Saksham and Jose, Cijo and Oquab, Maxime and Szafraniec, Marc and Wen, Wei and Xiong, Yunyang and Labatut, Patrick and Bojanowski, Piotr and Krishnamoorthi, Raghuraman and Chandra, Vikas},
+  year={2026},
+  eprint={2603.22387},
+  archivePrefix={arXiv},
+  primaryClass={cs.CV}
+}
+```
+## Acknowledgements
+The EUPE backbone was trained and released by Meta FAIR. The dataset loading utilities are from the DINOv3 repository. The Argus task heads, benchmarks, and packaging were done by [phanerozoic](https://huggingface.co/phanerozoic).

argus.py ADDED Viewed

	@@ -0,0 +1,2257 @@

+"""
+Argus: multi-task perception on a single EUPE-ViT-B backbone.
+    from transformers import AutoModel
+    model = AutoModel.from_pretrained("phanerozoic/argus", trust_remote_code=True)
+    result = model.perceive(image)
+The EUPE-ViT-B backbone architecture, all supporting layers, and the Argus
+task heads are inlined below. The backbone code is reproduced from
+facebookresearch/EUPE (Meta FAIR) under the FAIR Research License.
+"""
+import math
+import time
+from functools import partial
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.nn.init
+from PIL import Image
+from torch import Tensor, nn
+from torchvision.ops import nms
+from torchvision.transforms import v2
+from transformers import PretrainedConfig, PreTrainedModel
+# ===========================================================================
+# EUPE backbone — vendored verbatim from facebookresearch/EUPE
+# ===========================================================================
+# ---------- utility helpers (from eupe/utils/utils.py) ---------------------
+def cat_keep_shapes(x_list: List[Tensor]) -> Tuple[Tensor, List[Tuple[int]], List[int]]:
+    shapes = [x.shape for x in x_list]
+    num_tokens = [x.select(dim=-1, index=0).numel() for x in x_list]
+    flattened = torch.cat([x.flatten(0, -2) for x in x_list])
+    return flattened, shapes, num_tokens
+def uncat_with_shapes(flattened: Tensor, shapes: List[Tuple[int]], num_tokens: List[int]) -> List[Tensor]:
+    outputs_splitted = torch.split_with_sizes(flattened, num_tokens, dim=0)
+    shapes_adjusted = [shape[:-1] + torch.Size([flattened.shape[-1]]) for shape in shapes]
+    outputs_reshaped = [o.reshape(shape) for o, shape in zip(outputs_splitted, shapes_adjusted)]
+    return outputs_reshaped
+def named_apply(
+    fn: Callable,
+    module: nn.Module,
+    name: str = "",
+    depth_first: bool = True,
+    include_root: bool = False,
+) -> nn.Module:
+    if not depth_first and include_root:
+        fn(module=module, name=name)
+    for child_name, child_module in module.named_children():
+        child_name = ".".join((name, child_name)) if name else child_name
+        named_apply(
+            fn=fn,
+            module=child_module,
+            name=child_name,
+            depth_first=depth_first,
+            include_root=True,
+        )
+    if depth_first and include_root:
+        fn(module=module, name=name)
+    return module
+# ---------- RMSNorm (from eupe/layers/rms_norm.py) -------------------------
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-5):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps = eps
+    def reset_parameters(self) -> None:
+        nn.init.constant_(self.weight, 1)
+    def _norm(self, x: Tensor) -> Tensor:
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+    def forward(self, x: Tensor) -> Tensor:
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+# ---------- LayerScale (from eupe/layers/layer_scale.py) -------------------
+class LayerScale(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        init_values: Union[float, Tensor] = 1e-5,
+        inplace: bool = False,
+        device=None,
+    ) -> None:
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(torch.empty(dim, device=device))
+        self.init_values = init_values
+    def reset_parameters(self):
+        nn.init.constant_(self.gamma, self.init_values)
+    def forward(self, x: Tensor) -> Tensor:
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma
+# ---------- PatchEmbed (from eupe/layers/patch_embed.py) -------------------
+def make_2tuple(x):
+    if isinstance(x, tuple):
+        assert len(x) == 2
+        return x
+    assert isinstance(x, int)
+    return (x, x)
+class PatchEmbed(nn.Module):
+    def __init__(
+        self,
+        img_size: Union[int, Tuple[int, int]] = 224,
+        patch_size: Union[int, Tuple[int, int]] = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        norm_layer: Optional[Callable] = None,
+        flatten_embedding: bool = True,
+    ) -> None:
+        super().__init__()
+        image_HW = make_2tuple(img_size)
+        patch_HW = make_2tuple(patch_size)
+        patch_grid_size = (image_HW[0] // patch_HW[0], image_HW[1] // patch_HW[1])
+        self.img_size = image_HW
+        self.patch_size = patch_HW
+        self.patches_resolution = patch_grid_size
+        self.num_patches = patch_grid_size[0] * patch_grid_size[1]
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.flatten_embedding = flatten_embedding
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+    def forward(self, x: Tensor) -> Tensor:
+        _, _, H, W = x.shape
+        x = self.proj(x)
+        H, W = x.size(2), x.size(3)
+        x = x.flatten(2).transpose(1, 2)
+        x = self.norm(x)
+        if not self.flatten_embedding:
+            x = x.reshape(-1, H, W, self.embed_dim)
+        return x
+    def reset_parameters(self):
+        k = 1 / (self.in_chans * (self.patch_size[0] ** 2))
+        nn.init.uniform_(self.proj.weight, -math.sqrt(k), math.sqrt(k))
+        if self.proj.bias is not None:
+            nn.init.uniform_(self.proj.bias, -math.sqrt(k), math.sqrt(k))
+# ---------- RoPE (from eupe/layers/rope_position_encoding.py) --------------
+class RopePositionEmbedding(nn.Module):
+    """Builds sin/cos tables for 2-D rotary position embeddings over an H x W grid.
+    The periods are fixed at construction, either as powers of ``base`` or as
+    a geometric progression from ``min_period`` to ``max_period``; exactly
+    one of the two parametrisations must be supplied. ``forward`` returns
+    ``(sin, cos)``, each of shape ``[H * W, D_head]``. The ``shift_coords``,
+    ``jitter_coords`` and ``rescale_coords`` perturbations are applied only
+    while the module is in training mode.
+    """
+    def __init__(
+        self,
+        embed_dim: int,
+        *,
+        num_heads: int,
+        base: Optional[float] = 100.0,
+        min_period: Optional[float] = None,
+        max_period: Optional[float] = None,
+        normalize_coords: Literal["min", "max", "separate"] = "separate",
+        shift_coords: Optional[float] = None,
+        jitter_coords: Optional[float] = None,
+        rescale_coords: Optional[float] = None,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ):
+        super().__init__()
+        # The per-head dim is split into D_head // 4 periods per spatial axis
+        # (see forward), so it must be a multiple of 4.
+        assert embed_dim % (4 * num_heads) == 0
+        both_periods = min_period is not None and max_period is not None
+        # Reject "neither parametrisation" and "both parametrisations".
+        if (base is None and not both_periods) or (base is not None and both_periods):
+            raise ValueError("Either `base` or `min_period`+`max_period` must be provided.")
+        D_head = embed_dim // num_heads
+        self.base = base
+        self.min_period = min_period
+        self.max_period = max_period
+        self.D_head = D_head
+        self.normalize_coords = normalize_coords
+        self.shift_coords = shift_coords
+        self.jitter_coords = jitter_coords
+        self.rescale_coords = rescale_coords
+        self.dtype = dtype
+        # Allocated uninitialised here and filled by _init_weights() just below.
+        self.register_buffer(
+            "periods",
+            torch.empty(D_head // 4, device=device, dtype=dtype),
+            persistent=True,
+        )
+        self._init_weights()
+    def forward(self, *, H: int, W: int) -> Tuple[Tensor, Tensor]:
+        """Return ``(sin, cos)`` tables for an ``H`` x ``W`` grid, each ``[H * W, D_head]``."""
+        device = self.periods.device
+        dtype = self.dtype
+        dd = {"device": device, "dtype": dtype}
+        # Cell-centre coordinates (i + 0.5), divided by the larger side, the
+        # smaller side, or each axis's own length depending on the mode.
+        if self.normalize_coords == "max":
+            max_HW = max(H, W)
+            coords_h = torch.arange(0.5, H, **dd) / max_HW
+            coords_w = torch.arange(0.5, W, **dd) / max_HW
+        elif self.normalize_coords == "min":
+            min_HW = min(H, W)
+            coords_h = torch.arange(0.5, H, **dd) / min_HW
+            coords_w = torch.arange(0.5, W, **dd) / min_HW
+        elif self.normalize_coords == "separate":
+            coords_h = torch.arange(0.5, H, **dd) / H
+            coords_w = torch.arange(0.5, W, **dd) / W
+        else:
+            raise ValueError(f"Unknown normalize_coords: {self.normalize_coords}")
+        # [H, W, 2] grid of (h, w) pairs, flattened row-major to [H * W, 2].
+        coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1)
+        coords = coords.flatten(0, 1)
+        # Affine map that sends [0, 1] to [-1, +1].
+        coords = 2.0 * coords - 1.0
+        # Training-only augmentation: one uniform shift per axis.
+        if self.training and self.shift_coords is not None:
+            shift_hw = torch.empty(2, **dd).uniform_(-self.shift_coords, self.shift_coords)
+            coords += shift_hw[None, :]
+        # Training-only augmentation: one log-uniform scale factor per axis,
+        # drawn from [1 / jitter_coords, jitter_coords].
+        if self.training and self.jitter_coords is not None:
+            jitter_max = np.log(self.jitter_coords)
+            jitter_min = -jitter_max
+            jitter_hw = torch.empty(2, **dd).uniform_(jitter_min, jitter_max).exp()
+            coords *= jitter_hw[None, :]
+        # Training-only augmentation: a single log-uniform scale factor shared by both axes.
+        if self.training and self.rescale_coords is not None:
+            rescale_max = np.log(self.rescale_coords)
+            rescale_min = -rescale_max
+            rescale_hw = torch.empty(1, **dd).uniform_(rescale_min, rescale_max).exp()
+            coords *= rescale_hw
+        # [H * W, 2, D_head // 4] -> [H * W, D_head // 2] -> tiled to [H * W, D_head].
+        angles = 2 * math.pi * coords[:, :, None] / self.periods[None, None, :]
+        angles = angles.flatten(1, 2)
+        angles = angles.tile(2)
+        cos = torch.cos(angles)
+        sin = torch.sin(angles)
+        # Note the order of the returned pair: sin first, then cos.
+        return (sin, cos)
+    def _init_weights(self):
+        """Fill the ``periods`` buffer from ``base`` or from ``min_period``/``max_period``."""
+        device = self.periods.device
+        dtype = self.dtype
+        if self.base is not None:
+            # periods[i] = base ** (2 * i / (D_head // 2)) for i in [0, D_head // 4).
+            periods = self.base ** (
+                2 * torch.arange(self.D_head // 4, device=device, dtype=dtype) / (self.D_head // 2)
+            )
+        else:
+            # Geometric progression: (max / min) ** t / (max / min) * max for
+            # t in [0, 1], i.e. from min_period up to max_period.
+            base = self.max_period / self.min_period
+            exponents = torch.linspace(0, 1, self.D_head // 4, device=device, dtype=dtype)
+            periods = base ** exponents
+            periods = periods / base
+            periods = periods * self.max_period
+        self.periods.data = periods
+# ---------- FFN layers (from eupe/layers/ffn_layers.py) --------------------
+class ListForwardMixin(object):
+    def forward(self, x: Tensor):
+        raise NotImplementedError
+    def forward_list(self, x_list: List[Tensor]) -> List[Tensor]:
+        x_flat, shapes, num_tokens = cat_keep_shapes(x_list)
+        x_flat = self.forward(x_flat)
+        return uncat_with_shapes(x_flat, shapes, num_tokens)
+class Mlp(nn.Module, ListForwardMixin):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Callable[..., nn.Module] = nn.GELU,
+        drop: float = 0.0,
+        bias: bool = True,
+        device=None,
+    ) -> None:
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias, device=device)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias, device=device)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+class SwiGLUFFN(nn.Module, ListForwardMixin):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Optional[Callable[..., nn.Module]] = None,
+        drop: float = 0.0,
+        bias: bool = True,
+        align_to: int = 8,
+        device=None,
+    ) -> None:
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        d = int(hidden_features * 2 / 3)
+        swiglu_hidden_features = d + (-d % align_to)
+        self.w1 = nn.Linear(in_features, swiglu_hidden_features, bias=bias, device=device)
+        self.w2 = nn.Linear(in_features, swiglu_hidden_features, bias=bias, device=device)
+        self.w3 = nn.Linear(swiglu_hidden_features, out_features, bias=bias, device=device)
+    def forward(self, x: Tensor) -> Tensor:
+        x1 = self.w1(x)
+        x2 = self.w2(x)
+        hidden = F.silu(x1) * x2
+        return self.w3(hidden)
+# ---------- Attention (from eupe/layers/attention.py) ----------------------
+def rope_rotate_half(x: Tensor) -> Tensor:
+    x1, x2 = x.chunk(2, dim=-1)
+    return torch.cat([-x2, x1], dim=-1)
+def rope_apply(x: Tensor, sin: Tensor, cos: Tensor) -> Tensor:
+    return (x * cos) + (rope_rotate_half(x) * sin)
+class LinearKMaskedBias(nn.Linear):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        o = self.out_features
+        assert o % 3 == 0
+        if self.bias is not None:
+            self.register_buffer("bias_mask", torch.full_like(self.bias, fill_value=math.nan))
+    def forward(self, input: Tensor) -> Tensor:
+        masked_bias = self.bias * self.bias_mask.to(self.bias.dtype) if self.bias is not None else None
+        return F.linear(input, self.weight, masked_bias)
+class SelfAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+        mask_k_bias: bool = False,
+        device=None,
+    ) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim ** -0.5
+        linear_class = LinearKMaskedBias if mask_k_bias else nn.Linear
+        self.qkv = linear_class(dim, dim * 3, bias=qkv_bias, device=device)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias, device=device)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def apply_rope(self, q: Tensor, k: Tensor, rope) -> Tuple[Tensor, Tensor]:
+        q_dtype = q.dtype
+        k_dtype = k.dtype
+        sin, cos = rope
+        rope_dtype = sin.dtype
+        q = q.to(dtype=rope_dtype)
+        k = k.to(dtype=rope_dtype)
+        N = q.shape[-2]
+        prefix = N - sin.shape[-2]
+        assert prefix >= 0
+        q_prefix = q[:, :, :prefix, :]
+        q = rope_apply(q[:, :, prefix:, :], sin, cos)
+        q = torch.cat((q_prefix, q), dim=-2)
+        k_prefix = k[:, :, :prefix, :]
+        k = rope_apply(k[:, :, prefix:, :], sin, cos)
+        k = torch.cat((k_prefix, k), dim=-2)
+        q = q.to(dtype=q_dtype)
+        k = k.to(dtype=k_dtype)
+        return q, k
+    def forward(self, x: Tensor, attn_bias=None, rope=None) -> Tensor:
+        qkv = self.qkv(x)
+        attn_v = self.compute_attention(qkv=qkv, attn_bias=attn_bias, rope=rope)
+        x = self.proj(attn_v)
+        x = self.proj_drop(x)
+        return x
+    def forward_list(self, x_list, attn_bias=None, rope_list=None) -> List[Tensor]:
+        assert len(x_list) == len(rope_list)
+        x_flat, shapes, num_tokens = cat_keep_shapes(x_list)
+        qkv_flat = self.qkv(x_flat)
+        qkv_list = uncat_with_shapes(qkv_flat, shapes, num_tokens)
+        att_out = []
+        for _, (qkv, _, rope) in enumerate(zip(qkv_list, shapes, rope_list)):
+            att_out.append(self.compute_attention(qkv, attn_bias=attn_bias, rope=rope))
+        x_flat, shapes, num_tokens = cat_keep_shapes(att_out)
+        x_flat = self.proj(x_flat)
+        return uncat_with_shapes(x_flat, shapes, num_tokens)
+    def compute_attention(self, qkv: Tensor, attn_bias=None, rope=None) -> Tensor:
+        assert attn_bias is None
+        B, N, _ = qkv.shape
+        C = self.qkv.in_features
+        qkv = qkv.reshape(B, N, 3, self.num_heads, C // self.num_heads)
+        q, k, v = torch.unbind(qkv, 2)
+        q, k, v = [t.transpose(1, 2) for t in [q, k, v]]
+        if rope is not None:
+            q, k = self.apply_rope(q, k, rope)
+        x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
+        x = x.transpose(1, 2)
+        return x.reshape([B, N, C])
+# ---------- Block (from eupe/layers/block.py) ------------------------------
+class SelfAttentionBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        ffn_ratio: float = 4.0,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        ffn_bias: bool = True,
+        drop: float = 0.0,
+        attn_drop: float = 0.0,
+        init_values=None,
+        drop_path: float = 0.0,
+        act_layer: Callable[..., nn.Module] = nn.GELU,
+        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+        attn_class: Callable[..., nn.Module] = SelfAttention,
+        ffn_layer: Callable[..., nn.Module] = Mlp,
+        mask_k_bias: bool = False,
+        device=None,
+    ) -> None:
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = attn_class(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            proj_bias=proj_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            mask_k_bias=mask_k_bias,
+            device=device,
+        )
+        self.ls1 = LayerScale(dim, init_values=init_values, device=device) if init_values else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * ffn_ratio)
+        self.mlp = ffn_layer(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop,
+            bias=ffn_bias,
+            device=device,
+        )
+        self.ls2 = LayerScale(dim, init_values=init_values, device=device) if init_values else nn.Identity()
+        self.sample_drop_ratio = drop_path
+    @staticmethod
+    def _maybe_index_rope(rope, indices: Tensor):
+        if rope is None:
+            return None
+        sin, cos = rope
+        assert sin.ndim == cos.ndim
+        if sin.ndim == 4:
+            return sin[indices], cos[indices]
+        return sin, cos
+    def _forward_list(self, x_list: List[Tensor], rope_list=None) -> List[Tensor]:
+        b_list = [x.shape[0] for x in x_list]
+        sample_subset_sizes = [max(int(b * (1 - self.sample_drop_ratio)), 1) for b in b_list]
+        if self.training and self.sample_drop_ratio > 0.0:
+            residual_scale_factors = [b / s for b, s in zip(b_list, sample_subset_sizes)]
+            indices_1_list = [
+                torch.randperm(b, device=x.device)[:s]
+                for x, b, s in zip(x_list, b_list, sample_subset_sizes)
+            ]
+            x_subset_1_list = [x[i] for x, i in zip(x_list, indices_1_list)]
+            if rope_list is not None:
+                rope_subset_list = [
+                    self._maybe_index_rope(r, i) for r, i in zip(rope_list, indices_1_list)
+                ]
+            else:
+                rope_subset_list = rope_list
+            flattened, shapes, num_tokens = cat_keep_shapes(x_subset_1_list)
+            norm1 = uncat_with_shapes(self.norm1(flattened), shapes, num_tokens)
+            residual_1_list = self.attn.forward_list(norm1, rope_list=rope_subset_list)
+            x_attn_list = [
+                torch.index_add(x, dim=0, source=self.ls1(r1), index=i1, alpha=rsf)
+                for x, r1, i1, rsf in zip(x_list, residual_1_list, indices_1_list, residual_scale_factors)
+            ]
+            indices_2_list = [
+                torch.randperm(b, device=x.device)[:s]
+                for x, b, s in zip(x_list, b_list, sample_subset_sizes)
+            ]
+            x_subset_2_list = [x[i] for x, i in zip(x_attn_list, indices_2_list)]
+            flattened, shapes, num_tokens = cat_keep_shapes(x_subset_2_list)
+            norm2_list = uncat_with_shapes(self.norm2(flattened), shapes, num_tokens)
+            residual_2_list = self.mlp.forward_list(norm2_list)
+            x_ffn = [
+                torch.index_add(xa, dim=0, source=self.ls2(r2), index=i2, alpha=rsf)
+                for xa, r2, i2, rsf in zip(x_attn_list, residual_2_list, indices_2_list, residual_scale_factors)
+            ]
+        else:
+            x_out = []
+            for x, rope in zip(x_list, rope_list):
+                x_attn = x + self.ls1(self.attn(self.norm1(x), rope=rope))
+                x_ffn = x_attn + self.ls2(self.mlp(self.norm2(x_attn)))
+                x_out.append(x_ffn)
+            x_ffn = x_out
+        return x_ffn
+    def forward(self, x_or_x_list, rope_or_rope_list=None) -> List[Tensor]:
+        if isinstance(x_or_x_list, Tensor):
+            return self._forward_list([x_or_x_list], rope_list=[rope_or_rope_list])[0]
+        elif isinstance(x_or_x_list, list):
+            if rope_or_rope_list is None:
+                rope_or_rope_list = [None for _ in x_or_x_list]
+            return self._forward_list(x_or_x_list, rope_list=rope_or_rope_list)
+        raise AssertionError
+# ---------- DinoVisionTransformer (from eupe/models/vision_transformer.py)
+# String-keyed registries that DinoVisionTransformer uses to resolve its
+# ``ffn_layer``, ``norm_layer`` and ``pos_embed_rope_dtype`` arguments.
+ffn_layer_dict = {
+    "mlp": Mlp,
+    "swiglu": SwiGLUFFN,
+    # The numbered variants pre-bind ``align_to`` on SwiGLUFFN.
+    "swiglu32": partial(SwiGLUFFN, align_to=32),
+    "swiglu64": partial(SwiGLUFFN, align_to=64),
+    "swiglu128": partial(SwiGLUFFN, align_to=128),
+}
+norm_layer_dict = {
+    # The two LayerNorm entries differ only in their epsilon.
+    "layernorm": partial(nn.LayerNorm, eps=1e-6),
+    "layernormbf16": partial(nn.LayerNorm, eps=1e-5),
+    "rmsnorm": RMSNorm,
+}
+dtype_dict = {
+    "fp32": torch.float32,
+    "fp16": torch.float16,
+    "bf16": torch.bfloat16,
+}
+def init_weights_vit(module: nn.Module, name: str = ""):
+    if isinstance(module, nn.Linear):
+        torch.nn.init.trunc_normal_(module.weight, std=0.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+        if hasattr(module, "bias_mask") and module.bias_mask is not None:
+            o = module.out_features
+            module.bias_mask.fill_(1)
+            module.bias_mask[o // 3 : 2 * o // 3].fill_(0)
+    if isinstance(module, nn.LayerNorm):
+        module.reset_parameters()
+    if isinstance(module, LayerScale):
+        module.reset_parameters()
+    if isinstance(module, PatchEmbed):
+        module.reset_parameters()
+    if isinstance(module, RMSNorm):
+        module.reset_parameters()
+class DinoVisionTransformer(nn.Module):
+    def __init__(
+        self,
+        *,
+        img_size: int = 224,
+        patch_size: int = 16,
+        in_chans: int = 3,
+        pos_embed_rope_base: float = 100.0,
+        pos_embed_rope_min_period: Optional[float] = None,
+        pos_embed_rope_max_period: Optional[float] = None,
+        pos_embed_rope_normalize_coords: Literal["min", "max", "separate"] = "separate",
+        pos_embed_rope_shift_coords: Optional[float] = None,
+        pos_embed_rope_jitter_coords: Optional[float] = None,
+        pos_embed_rope_rescale_coords: Optional[float] = None,
+        pos_embed_rope_dtype: str = "bf16",
+        embed_dim: int = 768,
+        depth: int = 12,
+        num_heads: int = 12,
+        ffn_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        drop_path_rate: float = 0.0,
+        layerscale_init: Optional[float] = None,
+        norm_layer: str = "layernorm",
+        ffn_layer: str = "mlp",
+        ffn_bias: bool = True,
+        proj_bias: bool = True,
+        n_storage_tokens: int = 0,
+        mask_k_bias: bool = False,
+        untie_cls_and_patch_norms: bool = False,
+        untie_global_and_local_cls_norm: bool = False,
+        device: Any = None,
+        **ignored_kwargs,
+    ):
+        super().__init__()
+        del ignored_kwargs
+        norm_layer_cls = norm_layer_dict[norm_layer]
+        self.num_features = self.embed_dim = embed_dim
+        self.n_blocks = depth
+        self.num_heads = num_heads
+        self.patch_size = patch_size
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            flatten_embedding=False,
+        )
+        self.cls_token = nn.Parameter(torch.empty(1, 1, embed_dim, device=device))
+        self.n_storage_tokens = n_storage_tokens
+        if self.n_storage_tokens > 0:
+            self.storage_tokens = nn.Parameter(torch.empty(1, n_storage_tokens, embed_dim, device=device))
+        self.rope_embed = RopePositionEmbedding(
+            embed_dim=embed_dim,
+            num_heads=num_heads,
+            base=pos_embed_rope_base,
+            min_period=pos_embed_rope_min_period,
+            max_period=pos_embed_rope_max_period,
+            normalize_coords=pos_embed_rope_normalize_coords,
+            shift_coords=pos_embed_rope_shift_coords,
+            jitter_coords=pos_embed_rope_jitter_coords,
+            rescale_coords=pos_embed_rope_rescale_coords,
+            dtype=dtype_dict[pos_embed_rope_dtype],
+            device=device,
+        )
+        ffn_layer_cls = ffn_layer_dict[ffn_layer]
+        ffn_ratio_sequence = [ffn_ratio] * depth
+        blocks_list = [
+            SelfAttentionBlock(
+                dim=embed_dim,
+                num_heads=num_heads,
+                ffn_ratio=ffn_ratio_sequence[i],
+                qkv_bias=qkv_bias,
+                proj_bias=proj_bias,
+                ffn_bias=ffn_bias,
+                drop_path=drop_path_rate,
+                norm_layer=norm_layer_cls,
+                act_layer=nn.GELU,
+                ffn_layer=ffn_layer_cls,
+                init_values=layerscale_init,
+                mask_k_bias=mask_k_bias,
+                device=device,
+            )
+            for i in range(depth)
+        ]
+        self.chunked_blocks = False
+        self.blocks = nn.ModuleList(blocks_list)
+        self.norm = norm_layer_cls(embed_dim)
+        self.untie_cls_and_patch_norms = untie_cls_and_patch_norms
+        self.cls_norm = norm_layer_cls(embed_dim) if untie_cls_and_patch_norms else None
+        self.untie_global_and_local_cls_norm = untie_global_and_local_cls_norm
+        self.local_cls_norm = norm_layer_cls(embed_dim) if untie_global_and_local_cls_norm else None
+        self.head = nn.Identity()
+        self.mask_token = nn.Parameter(torch.empty(1, embed_dim, device=device))
+    def init_weights(self):
+        self.rope_embed._init_weights()
+        nn.init.normal_(self.cls_token, std=0.02)
+        if self.n_storage_tokens > 0:
+            nn.init.normal_(self.storage_tokens, std=0.02)
+        nn.init.zeros_(self.mask_token)
+        named_apply(init_weights_vit, self)
+    def prepare_tokens_with_masks(self, x: Tensor, masks=None) -> Tuple[Tensor, Tuple[int, int]]:
+        x = self.patch_embed(x)
+        B, H, W, _ = x.shape
+        x = x.flatten(1, 2)
+        if masks is not None:
+            x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
+            cls_token = self.cls_token
+        else:
+            cls_token = self.cls_token + 0 * self.mask_token
+        if self.n_storage_tokens > 0:
+            storage_tokens = self.storage_tokens
+        else:
+            storage_tokens = torch.empty(
+                1, 0, cls_token.shape[-1],
+                dtype=cls_token.dtype, device=cls_token.device,
+            )
+        x = torch.cat(
+            [cls_token.expand(B, -1, -1), storage_tokens.expand(B, -1, -1), x],
+            dim=1,
+        )
+        return x, (H, W)
+    def forward_features_list(self, x_list: List[Tensor], masks_list: List[Tensor]) -> List[Dict[str, Tensor]]:
+        x = []
+        rope = []
+        for t_x, t_masks in zip(x_list, masks_list):
+            t2_x, hw_tuple = self.prepare_tokens_with_masks(t_x, t_masks)
+            x.append(t2_x)
+            rope.append(hw_tuple)
+        for blk in self.blocks:
+            if self.rope_embed is not None:
+                rope_sincos = [self.rope_embed(H=H, W=W) for H, W in rope]
+            else:
+                rope_sincos = [None for _ in rope]
+            x = blk(x, rope_sincos)
+        all_x = x
+        output = []
+        for idx, (x, masks) in enumerate(zip(all_x, masks_list)):
+            if self.untie_cls_and_patch_norms or self.untie_global_and_local_cls_norm:
+                if self.untie_global_and_local_cls_norm and self.training and idx == 1:
+                    x_norm_cls_reg = self.local_cls_norm(x[:, : self.n_storage_tokens + 1])
+                elif self.untie_cls_and_patch_norms:
+                    x_norm_cls_reg = self.cls_norm(x[:, : self.n_storage_tokens + 1])
+                else:
+                    x_norm_cls_reg = self.norm(x[:, : self.n_storage_tokens + 1])
+                x_norm_patch = self.norm(x[:, self.n_storage_tokens + 1 :])
+            else:
+                x_norm = self.norm(x)
+                x_norm_cls_reg = x_norm[:, : self.n_storage_tokens + 1]
+                x_norm_patch = x_norm[:, self.n_storage_tokens + 1 :]
+            output.append({
+                "x_norm_clstoken": x_norm_cls_reg[:, 0],
+                "x_storage_tokens": x_norm_cls_reg[:, 1:],
+                "x_norm_patchtokens": x_norm_patch,
+                "x_prenorm": x,
+                "masks": masks,
+            })
+        return output
+    def forward_features(self, x, masks: Optional[Tensor] = None):
+        if isinstance(x, torch.Tensor):
+            return self.forward_features_list([x], [masks])[0]
+        return self.forward_features_list(x, masks)
+    def forward(self, *args, is_training: bool = False, **kwargs):
+        ret = self.forward_features(*args, **kwargs)
+        if is_training:
+            return ret
+        return self.head(ret["x_norm_clstoken"])
+def build_eupe_vitb16() -> DinoVisionTransformer:
+    # qkv_bias=False, mask_k_bias=False: the upstream EUPE-ViT-B release shipped
+    # with `qkv.bias_mask` filled with zeros, which makes the effective qkv bias
+    # zero at every block (masked_bias = bias * 0 = 0). We drop the bias parameter
+    # entirely here — the computation is bitwise-equivalent in fp32, bf16 output
+    # drift is sub-ULP and absorbed by every head except DPT depth (where it
+    # appears as ~2cm noise against a 39cm RMSE, i.e. below the head's own floor).
+    return DinoVisionTransformer(
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        pos_embed_rope_base=100,
+        pos_embed_rope_normalize_coords="separate",
+        pos_embed_rope_rescale_coords=2,
+        pos_embed_rope_dtype="fp32",
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        ffn_ratio=4,
+        qkv_bias=False,
+        drop_path_rate=0.0,
+        layerscale_init=1.0e-05,
+        norm_layer="layernormbf16",
+        ffn_layer="mlp",
+        ffn_bias=True,
+        proj_bias=True,
+        n_storage_tokens=4,
+        mask_k_bias=False,
+    )
+# ===========================================================================
+# Argus task heads
+# ===========================================================================
+def make_eupe_transform(resize_size: int):
+    return v2.Compose([
+        v2.ToImage(),
+        v2.Resize((resize_size, resize_size), antialias=True),
+        v2.ToDtype(torch.float32, scale=True),
+        v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
+    ])
+def _normalize_image_input(image_or_images) -> Tuple[bool, list]:
+    """Returns (was_single, [images]). Accepts a PIL.Image or an iterable of them."""
+    if isinstance(image_or_images, Image.Image):
+        return True, [image_or_images]
+    images = list(image_or_images)
+    if not images:
+        raise ValueError("empty image list")
+    for i, img in enumerate(images):
+        if not isinstance(img, Image.Image):
+            raise TypeError(f"images[{i}] is {type(img).__name__}, expected PIL.Image")
+    return False, images
+class _BackboneExportWrapper(nn.Module):
+    """ONNX-friendly wrapper: returns (cls, spatial) instead of a dict."""
+    def __init__(self, backbone: nn.Module):
+        super().__init__()
+        self.backbone = backbone
+    def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]:
+        out = self.backbone.forward_features(x)
+        cls = out["x_norm_clstoken"]
+        patches = out["x_norm_patchtokens"]
+        B, N, D = patches.shape
+        h = w = int(N ** 0.5)
+        spatial = patches.permute(0, 2, 1).reshape(B, D, h, w)
+        return cls, spatial
+class _SegHeadExportWrapper(nn.Module):
+    """ONNX-friendly wrapper: seg head + bilinear upsample to input resolution.
+    The bare seg head emits stride-16 logits (e.g. [B, 150, 40, 40] at 640px
+    input). model.segment() upsamples those to the input resolution before
+    argmax. This wrapper folds the upsample into the graph so the ONNX seg
+    output is already at input resolution — consumers argmax directly without
+    a separate interpolation step.
+    """
+    def __init__(self, seg_head: nn.Module, resolution: int):
+        super().__init__()
+        self.seg_head = seg_head
+        self.resolution = resolution
+    def forward(self, spatial_features: Tensor) -> Tensor:
+        logits = self.seg_head(spatial_features)
+        return F.interpolate(logits, size=(self.resolution, self.resolution),
+                             mode="bilinear", align_corners=False)
+class _DepthHeadExportWrapper(nn.Module):
+    """ONNX-friendly wrapper for the DPT depth head.
+    DPTDepthDecoder.forward takes (intermediates: List[Tensor], H: int, W: int),
+    which torch.onnx.export cannot trace cleanly because the List contains four
+    tensors and H/W are Python ints. The wrapper accepts the four intermediate
+    ViT-block activations as separate positional tensor inputs and forwards them
+    to the underlying decoder with the captured H and W.
+    """
+    def __init__(self, depth_head: nn.Module, H: int, W: int):
+        super().__init__()
+        self.depth_head = depth_head
+        self.H = H
+        self.W = W
+    def forward(self, inter0: Tensor, inter1: Tensor, inter2: Tensor, inter3: Tensor) -> Tensor:
+        return self.depth_head([inter0, inter1, inter2, inter3], self.H, self.W)
+class _ClassifierExportWrapper(nn.Module):
+    """ONNX-friendly wrapper for the ImageNet linear-softmax classifier.
+    Takes the backbone's CLS token, L2-normalizes, applies the stored
+    Linear(embed_dim, 1000) weight + bias, and returns a softmax
+    distribution over the 1000 ImageNet classes. The weight and bias are
+    captured as buffers so the graph is self-contained — no separate
+    weight file needed for classification inference.
+    """
+    def __init__(self, class_weight: Tensor, class_bias: Tensor):
+        super().__init__()
+        self.register_buffer("weight", class_weight.float().clone())
+        self.register_buffer("bias", class_bias.float().clone())
+    def forward(self, cls_token: Tensor) -> Tensor:
+        x = F.normalize(cls_token, dim=-1)
+        logits = F.linear(x, self.weight, self.bias)
+        return F.softmax(logits, dim=-1)
+class _ONNXBatchedNMS(torch.autograd.Function):
+    """Autograd wrapper that exports to ONNX NonMaxSuppression (opset >= 10).
+    ONNX's NonMaxSuppression handles batched multi-class NMS natively:
+      boxes  [B, N, 4]   in [y1, x1, y2, x2] order (center_point_box=0)
+      scores [B, C, N]
+      -> selected_indices [M, 3] where each row is [batch, class, box]
+    The eager forward path reproduces this via torchvision.ops.nms so
+    PyTorch tracing and verify=True both work without calling into
+    ORT for the reference.
+    """
+    @staticmethod
+    def symbolic(g, boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold):
+        return g.op(
+            "NonMaxSuppression",
+            boxes, scores,
+            max_output_boxes_per_class,
+            iou_threshold,
+            score_threshold,
+            center_point_box_i=0,
+        )
+    @staticmethod
+    def forward(ctx, boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold):
+        from torchvision.ops import nms as tv_nms
+        B, N, _ = boxes.shape
+        _, C, _ = scores.shape
+        max_out = int(max_output_boxes_per_class.item())
+        iou_thr = float(iou_threshold.item())
+        score_thr = float(score_threshold.item())
+        results: List[List[int]] = []
+        for b in range(B):
+            for c in range(C):
+                sc = scores[b, c]
+                mask = sc > score_thr
+                if not mask.any():
+                    continue
+                idx = mask.nonzero(as_tuple=True)[0]
+                # tv_nms expects [x1, y1, x2, y2]; our boxes are [y1, x1, y2, x2].
+                bx_xyxy = boxes[b, idx][:, [1, 0, 3, 2]]
+                keep = tv_nms(bx_xyxy, sc[idx], iou_thr)[:max_out]
+                for k in keep.tolist():
+                    results.append([b, c, int(idx[k].item())])
+        if not results:
+            return torch.zeros((0, 3), dtype=torch.long, device=boxes.device)
+        return torch.tensor(results, dtype=torch.long, device=boxes.device)
+class _DetectionHeadExportWrapper(nn.Module):
+    """ONNX-friendly wrapper for the detection head (simple FPN + FCOS).
+    Takes backbone stride-16 spatial features and returns decoded
+    per-location predictions concatenated across all five FPN levels.
+    Without NMS (default):
+      - boxes  [B, N_total, 4]   xyxy in input-resolution pixels,
+                                 decoded as (location - exp(reg)) /
+                                 (location + exp(reg)) and clamped.
+      - scores [B, N_total, num_classes]
+                                 sigmoid(cls_logits) * sigmoid(centerness).
+    With NMS (include_nms=True):
+      - boxes        [M, 4]   xyxy in input-resolution pixels
+      - scores       [M]
+      - class_labels [M]      int64 class index
+      - batch_indices[M]      int64 batch index
+    N_total = sum(H_i * W_i) across strides [8, 16, 32, 64, 128]. At
+    640px input: 6400 + 1600 + 400 + 100 + 25 = 8525 locations/image.
+    The NMS variant folds ONNX's NonMaxSuppression (opset >= 10) into
+    the graph using the configured iou / score / max_detections
+    parameters, producing a flat list of surviving detections across
+    all batches and classes. Useful for single-shot TensorRT / mobile
+    inference. Without NMS the consumer runs their own — hard vs soft,
+    per-class vs global, threshold tuning — without re-exporting.
+    NOTE(review): ``forward`` adds/subtracts the head's regression output
+    directly and applies no ``exp`` itself; presumably the detection head
+    already returns exponentiated distances — confirm against the head.
+    """
+    def __init__(self, detection_head: nn.Module, resolution: int,
+                 include_nms: bool = False,
+                 nms_iou_threshold: float = 0.5,
+                 nms_score_threshold: float = 0.05,
+                 nms_max_detections: int = 100):
+        super().__init__()
+        self.detection_head = detection_head
+        self.resolution = resolution
+        self.num_classes = detection_head.num_classes
+        self.include_nms = include_nms
+        self.nms_iou_threshold = nms_iou_threshold
+        self.nms_score_threshold = nms_score_threshold
+        self.nms_max_detections = nms_max_detections
+        # Compute per-level spatial sizes from the SimpleFeaturePyramid's actual
+        # output shapes, not from resolution // stride. The pyramid starts at
+        # stride-16 backbone features (H = resolution // 16) and produces:
+        #   P3 = 2*H         via ConvTranspose2d(stride=2)
+        #   P4 = H           via 1x1 + 3x3 convs (no stride)
+        #   P5 = (H+1)//2    via Conv2d(3x3, stride=2, padding=1)
+        #   P6 = (P5+1)//2   via Conv2d on P5
+        #   P7 = (P6+1)//2   via Conv2d on P6
+        # When resolution is a multiple of 128, these match resolution // stride
+        # exactly; at other resolutions the stride-2 convs round up via the
+        # padding=1 kernel=3 formula, so P6/P7 are slightly larger than
+        # nominal stride division suggests. Feature-pyramid-level locations
+        # still use the nominal FPN_STRIDES for FCOS box decoding because
+        # that's what eager `model.detect` does.
+        H = resolution // 16
+        p3 = 2 * H
+        p4 = H
+        p5 = (H + 1) // 2
+        p6 = (p5 + 1) // 2
+        p7 = (p6 + 1) // 2
+        # Every level is treated as a square grid.
+        feat_sizes = [(p3, p3), (p4, p4), (p5, p5), (p6, p6), (p7, p7)]
+        locs_per_level = []
+        for (h, w), s in zip(feat_sizes, FPN_STRIDES):
+            # Cell centres in input pixels: (index + 0.5) * stride.
+            ys = (torch.arange(h, dtype=torch.float32) + 0.5) * s
+            xs = (torch.arange(w, dtype=torch.float32) + 0.5) * s
+            gy, gx = torch.meshgrid(ys, xs, indexing="ij")
+            # Stored as (x, y) pairs in row-major order.
+            locs_per_level.append(torch.stack([gx.flatten(), gy.flatten()], -1))
+        all_locs = torch.cat(locs_per_level, 0)
+        # Registered as a buffer so it follows the module across devices.
+        self.register_buffer("all_locs", all_locs)
+    def forward(self, spatial_features: Tensor):
+        """Decode head outputs into ``(boxes, scores)`` or, with NMS, the selected detections."""
+        cls_logits, box_regs, centernesses = self.detection_head(spatial_features)
+        B = spatial_features.shape[0]
+        # Each per-level map goes from [B, C, H, W] to [B, H*W, C]; levels are
+        # then concatenated in the same order as ``all_locs``.
+        flat_cls = torch.cat(
+            [c.permute(0, 2, 3, 1).reshape(B, -1, self.num_classes) for c in cls_logits], dim=1)
+        flat_reg = torch.cat(
+            [r.permute(0, 2, 3, 1).reshape(B, -1, 4) for r in box_regs], dim=1)
+        flat_ctr = torch.cat(
+            [c.permute(0, 2, 3, 1).reshape(B, -1, 1) for c in centernesses], dim=1)
+        scores = torch.sigmoid(flat_cls) * torch.sigmoid(flat_ctr)
+        locs = self.all_locs.unsqueeze(0).expand(B, -1, -1)
+        # Regression channels are read as (left, top, right, bottom) distances
+        # from the location; results are clamped to [0, resolution].
+        x1 = (locs[..., 0:1] - flat_reg[..., 0:1]).clamp(0, self.resolution)
+        y1 = (locs[..., 1:2] - flat_reg[..., 1:2]).clamp(0, self.resolution)
+        x2 = (locs[..., 0:1] + flat_reg[..., 2:3]).clamp(0, self.resolution)
+        y2 = (locs[..., 1:2] + flat_reg[..., 3:4]).clamp(0, self.resolution)
+        boxes = torch.cat([x1, y1, x2, y2], dim=-1)
+        if not self.include_nms:
+            return boxes, scores
+        # ONNX NMS expects boxes in [y1, x1, y2, x2] (center_point_box=0) and
+        # scores with the class dim in the middle: [B, C, N].
+        boxes_yxyx = torch.cat([y1, x1, y2, x2], dim=-1)
+        scores_bcn = scores.permute(0, 2, 1).contiguous()
+        # The NMS parameters are passed as 0-d tensors.
+        max_out = torch.tensor(self.nms_max_detections, dtype=torch.long, device=boxes.device)
+        iou_thr = torch.tensor(self.nms_iou_threshold, dtype=torch.float32, device=boxes.device)
+        score_thr = torch.tensor(self.nms_score_threshold, dtype=torch.float32, device=boxes.device)
+        selected = _ONNXBatchedNMS.apply(
+            boxes_yxyx, scores_bcn, max_out, iou_thr, score_thr,
+        )
+        # Each row of ``selected`` is [batch, class, box].
+        batch_idx = selected[:, 0].long()
+        class_idx = selected[:, 1].long()
+        box_idx = selected[:, 2].long()
+        sel_boxes = boxes[batch_idx, box_idx]                  # [M, 4] xyxy
+        sel_scores = scores[batch_idx, box_idx, class_idx]     # [M]
+        return sel_boxes, sel_scores, class_idx, batch_idx
+class SegmentationHead(nn.Module):
+    def __init__(self, in_dim: int = 768, num_classes: int = 150):
+        super().__init__()
+        self.batchnorm_layer = nn.BatchNorm2d(in_dim)
+        self.conv = nn.Conv2d(in_dim, num_classes, kernel_size=1)
+    def forward(self, x: Tensor) -> Tensor:
+        return self.conv(self.batchnorm_layer(x))
+class DepthHead(nn.Module):
+    def __init__(self, in_dim: int = 768, n_bins: int = 256,
+                 min_depth: float = 0.001, max_depth: float = 10.0):
+        super().__init__()
+        self.batchnorm_layer = nn.BatchNorm2d(in_dim)
+        self.conv_depth = nn.Conv2d(in_dim, n_bins, kernel_size=1)
+        self.min_depth = min_depth
+        self.max_depth = max_depth
+        self.n_bins = n_bins
+    def forward(self, x: Tensor) -> Tensor:
+        logits = self.conv_depth(self.batchnorm_layer(x))
+        logit = torch.relu(logits) + 0.1
+        logit = logit / logit.sum(dim=1, keepdim=True)
+        bins = torch.linspace(self.min_depth, self.max_depth, self.n_bins, device=x.device)
+        return torch.einsum("bkhw,k->bhw", logit, bins).unsqueeze(1)
+# ===========================================================================
+# Detection (FCOS with ViTDet-style simple feature pyramid)
+# ===========================================================================
+# Nominal stride, in input pixels, of the five feature-pyramid levels, ordered
+# from the finest level to the coarsest.
+FPN_STRIDES = [8, 16, 32, 64, 128]
+# The 80 class names, in list order.
+# NOTE(review): presumably list position equals the detector's class index; confirm.
+COCO_CLASSES = [
+    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
+    "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
+    "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
+    "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
+    "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
+    "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
+    "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+    "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
+    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
+    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
+    "toothbrush",
+]
+def cofiber_decompose(f: Tensor, n_scales: int) -> List[Tensor]:
+    """Iterated multi-scale decomposition. Each step subtracts the
+    downsampled-then-upsampled component of the current residual and
+    recurses on the remainder. Zero learned parameters. The final entry is
+    the lowest-frequency remainder."""
+    cofibers: List[Tensor] = []
+    residual = f
+    for _ in range(n_scales - 1):
+        omega = F.avg_pool2d(residual, 2)
+        sigma_omega = F.interpolate(omega, size=residual.shape[2:],
+                                    mode="bilinear", align_corners=False)
+        cofibers.append(residual - sigma_omega)
+        residual = omega
+    cofibers.append(residual)
+    return cofibers
+def make_sin_pos_emb(H: int, W: int, dim: int, device) -> Tensor:
+    """2D sinusoidal positional encoding over an H x W grid. Concatenated
+    to the backbone patch features before the head stem."""
+    assert dim % 4 == 0, "pos emb dim must be divisible by 4"
+    d = dim // 4
+    ys = torch.arange(H, device=device, dtype=torch.float32)
+    xs = torch.arange(W, device=device, dtype=torch.float32)
+    omega = torch.exp(torch.arange(d, device=device, dtype=torch.float32)
+                      * -(math.log(10000.0) / d))
+    pe_y = torch.zeros(H, d * 2, device=device)
+    pe_y[:, 0::2] = torch.sin(ys[:, None] * omega[None, :])
+    pe_y[:, 1::2] = torch.cos(ys[:, None] * omega[None, :])
+    pe_x = torch.zeros(W, d * 2, device=device)
+    pe_x[:, 0::2] = torch.sin(xs[:, None] * omega[None, :])
+    pe_x[:, 1::2] = torch.cos(xs[:, None] * omega[None, :])
+    pos = torch.zeros(dim, H, W, device=device)
+    pos[:d * 2] = pe_y.permute(1, 0)[:, :, None].expand(-1, H, W)
+    pos[d * 2:] = pe_x.permute(1, 0)[None, :, :].expand(H, -1, W).permute(1, 0, 2)
+    return pos.unsqueeze(0)
+class ConvGNBlock(nn.Module):
+    def __init__(self, channels: int):
+        super().__init__()
+        self.conv = nn.Conv2d(channels, channels, 3, padding=1)
+        self.norm = nn.GroupNorm(min(32, channels), channels)
+        self.act = nn.GELU()
+    def forward(self, x: Tensor) -> Tensor:
+        return self.act(self.norm(self.conv(x)))
+class DWResBlock(nn.Module):
+    def __init__(self, channels: int):
+        super().__init__()
+        self.pw = nn.Conv2d(channels, channels, 1)
+        self.act = nn.GELU()
+        self.dw = nn.Conv2d(channels, channels, 3, padding=1, groups=channels)
+        self.norm = nn.GroupNorm(min(32, channels), channels)
+    def forward(self, x: Tensor) -> Tensor:
+        return x + self.norm(self.dw(self.act(self.pw(x))))
+def make_tower(hidden: int, n_std: int, n_dw: int) -> nn.Sequential:
+    layers: List[nn.Module] = [ConvGNBlock(hidden) for _ in range(n_std)]
+    layers += [DWResBlock(hidden) for _ in range(n_dw)]
+    return nn.Sequential(*layers)
+class SplitTowerHead(nn.Module):
+    """Detection head operating on a cofiber decomposition of the frozen
+    backbone features. Five prediction levels (strides 8, 16, 32, 64, 128):
+    a stride-8 level synthesized by a transposed convolution from the
+    stride-16 band and four cofiber bands at strides 16, 32, 64, 128.
+    Separate classification and regression towers of depth (n_std_layers +
+    n_dw_layers) with weights shared across levels. Classification via
+    cosine similarity against frozen CLIP text-encoder embeddings of the
+    COCO class names; regression via exponentiated LTRB distances with a
+    learned per-level scale; centerness via a single 1x1 convolution.
+    Inference-only within Argus: no DFL, no IoU-aware branch, no
+    per-scale bias. The text_embed buffer is populated by from_pretrained's
+    state_dict load."""
+    def __init__(self,
+                 feat_dim: int = 768,
+                 hidden: int = 160,
+                 n_std_layers: int = 5,
+                 n_dw_layers: int = 4,
+                 n_scales: int = 4,
+                 pos_emb_dim: int = 64,
+                 num_classes: int = 80,
+                 text_embed_dim: int = 768):
+        super().__init__()
+        self.n_scales = n_scales
+        self.pos_emb_dim = pos_emb_dim
+        self.num_classes = num_classes
+        self.text_embed_dim = text_embed_dim
+        n_total = n_scales + 1
+        input_dim = feat_dim + pos_emb_dim
+        self.scale_norms = nn.ModuleList([nn.GroupNorm(1, input_dim) for _ in range(n_scales)])
+        self.stem = nn.Conv2d(input_dim, hidden, 1)
+        self.stem_act = nn.GELU()
+        self.p3_upsample = nn.ConvTranspose2d(hidden, hidden, 2, stride=2)
+        self.p3_norm = nn.GroupNorm(min(32, hidden), hidden)
+        self.lateral_convs = nn.ModuleList([nn.Conv2d(hidden, hidden, 1) for _ in range(n_scales - 1)])
+        self.lateral_norms = nn.ModuleList(
+            [nn.GroupNorm(min(32, hidden), hidden) for _ in range(n_scales - 1)])
+        self.cls_tower = make_tower(hidden, n_std_layers, n_dw_layers)
+        self.reg_tower = make_tower(hidden, n_std_layers, n_dw_layers)
+        # CLIP text-aligned classifier. The text_embed buffer is filled from
+        # the state dict at from_pretrained; the zero placeholder here only
+        # exists so the module can be constructed before weights arrive.
+        self.register_buffer("text_embed",
+                             torch.zeros(num_classes, text_embed_dim))
+        self.cls_project = nn.Linear(hidden, text_embed_dim, bias=False)
+        self.logit_scale = nn.Parameter(torch.tensor(math.log(1.0 / 0.07)))
+        self.cls_bias = nn.Parameter(torch.full((num_classes,), -math.log(99)))
+        self.reg_pred = nn.Conv2d(hidden, 4, 1)
+        self.ctr_pred = nn.Conv2d(hidden, 1, 1)
+        self.scale_params = nn.Parameter(torch.ones(n_total))
+    def forward(self, spatial: Tensor) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]:
+        B, C, H_, W_ = spatial.shape
+        pos = make_sin_pos_emb(H_, W_, self.pos_emb_dim, spatial.device).expand(B, -1, -1, -1)
+        spatial = torch.cat([spatial, pos], dim=1)
+        cofibers = cofiber_decompose(spatial, self.n_scales)
+        scale_features: List[Tensor] = []
+        for i, cof in enumerate(cofibers):
+            x = self.stem_act(self.stem(self.scale_norms[i](cof)))
+            scale_features.append(x)
+        # Top-down lateral fusion from coarser to finer scales.
+        for i in range(len(scale_features) - 2, -1, -1):
+            coarse_up = F.interpolate(scale_features[i + 1],
+                                      size=scale_features[i].shape[2:],
+                                      mode="bilinear", align_corners=False)
+            scale_features[i] = self.lateral_norms[i](
+                scale_features[i] + self.lateral_convs[i](coarse_up))
+        p3 = self.p3_norm(self.p3_upsample(scale_features[0]))
+        all_features = [p3] + scale_features
+        cls_l, reg_l, ctr_l = [], [], []
+        for i, x in enumerate(all_features):
+            cls_feat = self.cls_tower(x)
+            reg_feat = self.reg_tower(x)
+            B_, _, Hi, Wi = cls_feat.shape
+            f = cls_feat.permute(0, 2, 3, 1).reshape(-1, cls_feat.shape[1])
+            f_proj = self.cls_project(f)
+            f_norm = F.normalize(f_proj, p=2, dim=-1)
+            logits = f_norm @ self.text_embed.t()
+            cls = (logits * self.logit_scale.exp() + self.cls_bias).reshape(
+                B_, Hi, Wi, self.num_classes).permute(0, 3, 1, 2)
+            reg_raw = (self.reg_pred(reg_feat) * self.scale_params[i]).clamp(-10, 10)
+            reg = reg_raw.exp()
+            ctr = self.ctr_pred(reg_feat)
+            cls_l.append(cls)
+            reg_l.append(reg)
+            ctr_l.append(ctr)
+        return cls_l, reg_l, ctr_l
+def _make_locations(feature_sizes: List[Tuple[int, int]], strides: List[int], device) -> List[Tensor]:
+    """Per-level center coordinates of feature-map locations in image space."""
+    all_locs = []
+    for (h, w), s in zip(feature_sizes, strides):
+        ys = (torch.arange(h, device=device, dtype=torch.float32) + 0.5) * s
+        xs = (torch.arange(w, device=device, dtype=torch.float32) + 0.5) * s
+        grid_y, grid_x = torch.meshgrid(ys, xs, indexing="ij")
+        locs = torch.stack([grid_x.flatten(), grid_y.flatten()], dim=-1)
+        all_locs.append(locs)
+    return all_locs
+@torch.inference_mode()
+def _decode_detections(
+    cls_logits_per_level: List[Tensor],
+    box_regs_per_level: List[Tensor],
+    centernesses_per_level: List[Tensor],
+    locations_per_level: List[Tensor],
+    image_sizes: List[Tuple[int, int]],
+    score_thresh: float = 0.05,
+    nms_thresh: float = 0.5,
+    max_per_level: int = 1000,
+    max_per_image: int = 100,
+) -> List[Dict[str, Tensor]]:
+    """Convert per-level logits/regs/centerness into per-image detections (xyxy boxes)."""
+    B = cls_logits_per_level[0].shape[0]
+    num_classes = cls_logits_per_level[0].shape[1]
+    device = cls_logits_per_level[0].device
+    per_image_results = []
+    for image_idx in range(B):
+        all_boxes, all_scores, all_labels = [], [], []
+        for cls_l, reg_l, ctr_l, locs_l in zip(
+            cls_logits_per_level, box_regs_per_level, centernesses_per_level, locations_per_level
+        ):
+            cls = cls_l[image_idx].permute(1, 2, 0).reshape(-1, num_classes)
+            reg = reg_l[image_idx].permute(1, 2, 0).reshape(-1, 4)
+            ctr = ctr_l[image_idx].permute(1, 2, 0).reshape(-1)
+            cls_prob = torch.sigmoid(cls)
+            ctr_prob = torch.sigmoid(ctr)
+            scores = cls_prob * ctr_prob[:, None]
+            mask = scores > score_thresh
+            if not mask.any():
+                continue
+            cand_loc, cand_cls = mask.nonzero(as_tuple=True)
+            cand_scores = scores[cand_loc, cand_cls]
+            if cand_scores.numel() > max_per_level:
+                top = cand_scores.topk(max_per_level)
+                cand_scores = top.values
+                idx = top.indices
+                cand_loc = cand_loc[idx]
+                cand_cls = cand_cls[idx]
+            cand_locs_xy = locs_l[cand_loc]
+            cand_reg = reg[cand_loc]
+            boxes = torch.stack([
+                cand_locs_xy[:, 0] - cand_reg[:, 0],
+                cand_locs_xy[:, 1] - cand_reg[:, 1],
+                cand_locs_xy[:, 0] + cand_reg[:, 2],
+                cand_locs_xy[:, 1] + cand_reg[:, 3],
+            ], dim=-1)
+            all_boxes.append(boxes)
+            all_scores.append(cand_scores)
+            all_labels.append(cand_cls)
+        if all_boxes:
+            boxes = torch.cat(all_boxes, dim=0)
+            scores = torch.cat(all_scores, dim=0)
+            labels = torch.cat(all_labels, dim=0)
+            H, W = image_sizes[image_idx]
+            boxes[:, 0::2] = boxes[:, 0::2].clamp(0, W)
+            boxes[:, 1::2] = boxes[:, 1::2].clamp(0, H)
+            keep_all = []
+            for c in labels.unique():
+                cm = labels == c
+                keep = nms(boxes[cm], scores[cm], nms_thresh)
+                keep_idx = cm.nonzero(as_tuple=True)[0][keep]
+                keep_all.append(keep_idx)
+            keep_all = torch.cat(keep_all, dim=0)
+            boxes = boxes[keep_all]
+            scores = scores[keep_all]
+            labels = labels[keep_all]
+            if scores.numel() > max_per_image:
+                top = scores.topk(max_per_image)
+                boxes = boxes[top.indices]
+                scores = top.values
+                labels = labels[top.indices]
+        else:
+            boxes = torch.zeros((0, 4), device=device)
+            scores = torch.zeros((0,), device=device)
+            labels = torch.zeros((0,), dtype=torch.long, device=device)
+        per_image_results.append({"boxes": boxes, "scores": scores, "labels": labels})
+    return per_image_results
+def _letterbox_to_square(image: Image.Image, resolution: int) -> Tuple[Image.Image, float, Tuple[int, int]]:
+    """Resize preserving aspect ratio and pad bottom/right with black. Matches the training transform."""
+    W0, H0 = image.size
+    scale = resolution / max(H0, W0)
+    new_w = int(round(W0 * scale))
+    new_h = int(round(H0 * scale))
+    resized = image.resize((new_w, new_h), Image.BILINEAR)
+    canvas = Image.new("RGB", (resolution, resolution), (0, 0, 0))
+    canvas.paste(resized, (0, 0))
+    return canvas, scale, (W0, H0)
+# ===========================================================================
+# DPT depth decoder (multi-scale, hooks into ViT blocks [2, 5, 8, 11])
+# ===========================================================================
+# Indices into backbone.blocks whose outputs Argus.depth captures with forward
+# hooks; the captured tensors are handed to the depth decoder in this order.
+HOOK_BLOCK_INDICES = [2, 5, 8, 11]
+# Number of leading non-patch tokens that _DPTReassemble slices off each
+# captured token sequence before reshaping it into a grid.
+N_PREFIX_TOKENS = 5  # 1 CLS + 4 register/storage tokens
+class _ResidualConvUnit(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.conv1 = nn.Conv2d(dim, dim, 3, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(dim)
+        self.conv2 = nn.Conv2d(dim, dim, 3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(dim)
+        self.act = nn.GELU()
+    def forward(self, x: Tensor) -> Tensor:
+        return x + self.bn2(self.conv2(self.act(self.bn1(self.conv1(x)))))
+class _FeatureFusionBlock(nn.Module):
+    def __init__(self, dim: int, has_skip: bool = True):
+        super().__init__()
+        self.rcu1 = _ResidualConvUnit(dim)
+        self.rcu2 = _ResidualConvUnit(dim)
+        self.skip_proj = nn.Conv2d(dim, dim, 1) if has_skip else None
+    def forward(self, x: Tensor, skip: Optional[Tensor] = None) -> Tensor:
+        if skip is not None and self.skip_proj is not None:
+            x = x + self.skip_proj(skip)
+        x = self.rcu1(x)
+        x = self.rcu2(x)
+        return F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=False)
+class _DPTReassemble(nn.Module):
+    def __init__(self, in_dim: int = 768, out_dim: int = 256):
+        super().__init__()
+        self.projects = nn.ModuleList([
+            nn.Sequential(nn.LayerNorm(in_dim), nn.Linear(in_dim, out_dim))
+            for _ in range(4)
+        ])
+        self.refine = nn.ModuleList([
+            nn.Sequential(
+                nn.Conv2d(out_dim, out_dim, 3, padding=1, bias=False),
+                nn.BatchNorm2d(out_dim),
+                nn.GELU(),
+            )
+            for _ in range(4)
+        ])
+    def forward(self, intermediates: List[Tensor], H: int, W: int) -> List[Tensor]:
+        out = []
+        for feat, proj, refine in zip(intermediates, self.projects, self.refine):
+            patches = feat[:, N_PREFIX_TOKENS:, :]
+            patches = proj(patches)
+            B, N, D = patches.shape
+            spatial = patches.permute(0, 2, 1).reshape(B, D, H, W)
+            out.append(refine(spatial))
+        level_4 = F.interpolate(out[0], scale_factor=4, mode="bilinear", align_corners=False)
+        level_8 = F.interpolate(out[1], scale_factor=2, mode="bilinear", align_corners=False)
+        level_16 = out[2]
+        level_32 = F.interpolate(out[3], scale_factor=0.5, mode="bilinear", align_corners=False)
+        return [level_4, level_8, level_16, level_32]
+class DPTDepthDecoder(nn.Module):
+    def __init__(self, in_dim: int = 768, decoder_dim: int = 256,
+                 n_bins: int = 256, min_depth: float = 0.001, max_depth: float = 10.0):
+        super().__init__()
+        self.n_bins = n_bins
+        self.min_depth = min_depth
+        self.max_depth = max_depth
+        self.reassemble = _DPTReassemble(in_dim=in_dim, out_dim=decoder_dim)
+        self.fusion_blocks = nn.ModuleList([
+            _FeatureFusionBlock(decoder_dim, has_skip=True),
+            _FeatureFusionBlock(decoder_dim, has_skip=True),
+            _FeatureFusionBlock(decoder_dim, has_skip=True),
+            _FeatureFusionBlock(decoder_dim, has_skip=False),
+        ])
+        self.head = nn.Sequential(
+            nn.Conv2d(decoder_dim, decoder_dim, 3, padding=1, bias=False),
+            nn.BatchNorm2d(decoder_dim),
+            nn.GELU(),
+            nn.Conv2d(decoder_dim, n_bins, 1),
+        )
+    def forward(self, intermediates: List[Tensor], H: int, W: int,
+                return_distribution: bool = False):
+        levels = self.reassemble(intermediates, H, W)
+        x = self.fusion_blocks[3](levels[3])
+        x = self.fusion_blocks[2](x, skip=levels[2])
+        x = self.fusion_blocks[1](x, skip=levels[1])
+        x = self.fusion_blocks[0](x, skip=levels[0])
+        logits = self.head(x)
+        distribution = torch.relu(logits) + 0.1
+        distribution = distribution / distribution.sum(dim=1, keepdim=True)
+        bins = torch.linspace(self.min_depth, self.max_depth, self.n_bins, device=x.device)
+        depth = torch.einsum("bkhw,k->bhw", distribution, bins).unsqueeze(1)
+        if return_distribution:
+            return depth, distribution, bins
+        return depth
+# ===========================================================================
+# Argus model (transformers-compatible)
+# ===========================================================================
+class ArgusConfig(PretrainedConfig):
+    """Configuration for the Argus model.
+    Every constructor argument is stored unchanged as an attribute of the same
+    name, except that the list arguments fall back to a default when falsy:
+    class_ids and class_names become [], detection_class_names becomes a copy
+    of COCO_CLASSES. Remaining keyword arguments go to PretrainedConfig."""
+    model_type = "argus"
+    def __init__(
+        self,
+        embed_dim: int = 768,
+        patch_size: int = 16,
+        num_seg_classes: int = 150,
+        depth_n_bins: int = 256,
+        depth_min_depth: float = 0.001,
+        depth_max_depth: float = 10.0,
+        num_imagenet_classes: int = 1000,
+        class_ids: Optional[list] = None,
+        class_names: Optional[list] = None,
+        detection_num_classes: int = 80,
+        detection_hidden: int = 160,
+        detection_n_std_layers: int = 5,
+        detection_n_dw_layers: int = 4,
+        detection_n_scales: int = 4,
+        detection_pos_emb_dim: int = 64,
+        detection_text_embed_dim: int = 768,
+        detection_class_names: Optional[list] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # Backbone feature width; Argus passes it to every head as the input dim.
+        self.embed_dim = embed_dim
+        self.patch_size = patch_size
+        # Output channels of the segmentation head.
+        self.num_seg_classes = num_seg_classes
+        # Depth decoder: number of depth bins and the range they span.
+        self.depth_n_bins = depth_n_bins
+        self.depth_min_depth = depth_min_depth
+        self.depth_max_depth = depth_max_depth
+        # Row count of the class_logit_weight / class_logit_bias buffers.
+        self.num_imagenet_classes = num_imagenet_classes
+        # Lookup tables indexed by classifier output index in Argus.classify.
+        self.class_ids = class_ids or []
+        self.class_names = class_names or []
+        # Constructor arguments of SplitTowerHead.
+        self.detection_num_classes = detection_num_classes
+        self.detection_hidden = detection_hidden
+        self.detection_n_std_layers = detection_n_std_layers
+        self.detection_n_dw_layers = detection_n_dw_layers
+        self.detection_n_scales = detection_n_scales
+        self.detection_pos_emb_dim = detection_pos_emb_dim
+        self.detection_text_embed_dim = detection_text_embed_dim
+        # Label index -> name table used by Argus.detect.
+        self.detection_class_names = detection_class_names or list(COCO_CLASSES)
+class Argus(PreTrainedModel):
+    config_class = ArgusConfig
+    base_model_prefix = "argus"
+    supports_gradient_checkpointing = False
+    _tied_weights_keys: list = []
+    all_tied_weights_keys: dict = {}
+    def __init__(self, config: ArgusConfig):
+        super().__init__(config)
+        self.backbone = build_eupe_vitb16()
+        self.seg_head = SegmentationHead(config.embed_dim, config.num_seg_classes)
+        self.depth_head = DPTDepthDecoder(
+            in_dim=config.embed_dim,
+            decoder_dim=256,
+            n_bins=config.depth_n_bins,
+            min_depth=config.depth_min_depth,
+            max_depth=config.depth_max_depth,
+        )
+        self.register_buffer(
+            "class_logit_weight",
+            torch.zeros(config.num_imagenet_classes, config.embed_dim),
+            persistent=True,
+        )
+        self.register_buffer(
+            "class_logit_bias",
+            torch.zeros(config.num_imagenet_classes),
+            persistent=True,
+        )
+        self.detection_head = SplitTowerHead(
+            feat_dim=config.embed_dim,
+            hidden=config.detection_hidden,
+            n_std_layers=config.detection_n_std_layers,
+            n_dw_layers=config.detection_n_dw_layers,
+            n_scales=config.detection_n_scales,
+            pos_emb_dim=config.detection_pos_emb_dim,
+            num_classes=config.detection_num_classes,
+            text_embed_dim=config.detection_text_embed_dim,
+        )
+        for p in self.backbone.parameters():
+            p.requires_grad = False
+        self.backbone.eval()
+        self.seg_head.eval()
+        self.depth_head.eval()
+        self.detection_head.eval()
+    def _init_weights(self, module):
+        # HF reallocates missing buffers and parameters with torch.empty()
+        # (uninitialized memory) on from_pretrained. Populate sensible defaults
+        # for the standard layer types used by the detection head, and zero any
+        # Argus-level buffer that came back NaN.
+        if isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)):
+            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.GroupNorm):
+            nn.init.ones_(module.weight)
+            nn.init.zeros_(module.bias)
+        if module is self:
+            for name in ("class_logit_weight", "class_logit_bias"):
+                if hasattr(self, name):
+                    buf = getattr(self, name)
+                    if torch.isnan(buf).any() or torch.isinf(buf).any():
+                        buf.data.zero_()
+    @property
+    def class_ids(self):
+        """Classifier index -> class id table, read from the config."""
+        return self.config.class_ids
+    @property
+    def class_names(self):
+        """Classifier index -> class name table, read from the config."""
+        return self.config.class_names
+    def quantize_int8(self):
+        """Apply INT8 weight-only quantization via torchao. Reduces VRAM by ~11%
+        with negligible accuracy loss (<0.05 m depth drift, 100% classification
+        agreement). Requires torchao: pip install torchao.
+        The model is quantized in place.
+        Returns:
+            self, so the call can be chained.
+        Raises:
+            ImportError: if torchao cannot be imported."""
+        try:
+            # Imported lazily so torchao stays an optional dependency.
+            from torchao.quantization import quantize_, Int8WeightOnlyConfig
+        except ImportError as e:
+            raise ImportError("torchao is required for INT8 quantization: pip install torchao") from e
+        quantize_(self, Int8WeightOnlyConfig())
+        return self
+    @torch.inference_mode()
+    def _extract(self, image_tensor: Tensor) -> Tuple[Tensor, Tensor]:
+        with torch.autocast(self.device.type, dtype=torch.bfloat16, enabled=self.device.type == "cuda"):
+            out = self.backbone.forward_features(image_tensor)
+        cls = out["x_norm_clstoken"].float()
+        patches = out["x_norm_patchtokens"].float()
+        B, N, D = patches.shape
+        h = w = int(N ** 0.5)
+        spatial = patches.permute(0, 2, 1).reshape(B, D, h, w)
+        return cls, spatial
+    @torch.inference_mode()
+    def classify(self, image_or_images, top_k: int = 5):
+        single, images = _normalize_image_input(image_or_images)
+        transform = make_eupe_transform(224)
+        batch = torch.stack([transform(img) for img in images]).to(self.device)
+        cls, _ = self._extract(batch)
+        cls = F.normalize(cls, dim=-1)
+        w = self.class_logit_weight.to(cls.dtype)
+        b = self.class_logit_bias.to(cls.dtype)
+        logits = F.linear(cls, w, b)
+        scores_full = F.softmax(logits, dim=-1)
+        topk = scores_full.topk(top_k, dim=-1)
+        top2 = scores_full.topk(2, dim=-1)
+        margins = (top2.values[:, 0] - top2.values[:, 1]).tolist()
+        results = []
+        for b in range(len(images)):
+            entries = []
+            for score, idx in zip(topk.values[b].tolist(), topk.indices[b].tolist()):
+                entries.append({
+                    "class_id": self.class_ids[idx],
+                    "class_name": self.class_names[idx],
+                    "score": float(score),
+                })
+            entries[0]["margin"] = float(margins[b])
+            results.append(entries)
+        return results[0] if single else results
+    @torch.inference_mode()
+    def segment(self, image_or_images, resolution: int = 512, return_confidence: bool = False):
+        single, images = _normalize_image_input(image_or_images)
+        transform = make_eupe_transform(resolution)
+        batch = torch.stack([transform(img) for img in images]).to(self.device)
+        _, spatial = self._extract(batch)
+        with torch.autocast(self.device.type, dtype=torch.bfloat16, enabled=self.device.type == "cuda"):
+            logits = self.seg_head(spatial)
+        logits = F.interpolate(logits, size=(resolution, resolution), mode="bilinear", align_corners=False)
+        seg_maps = logits.argmax(dim=1)  # [B, H, W]
+        if return_confidence:
+            probs = F.softmax(logits.float(), dim=1)
+            conf_maps = probs.max(dim=1).values  # [B, H, W] in [0, 1]
+            if single:
+                return seg_maps[0], conf_maps[0]
+            return [(seg_maps[i], conf_maps[i]) for i in range(len(images))]
+        if single:
+            return seg_maps[0]
+        return [seg_maps[i] for i in range(len(images))]
+    @torch.inference_mode()
+    def depth(self, image_or_images, resolution: int = 416, return_confidence: bool = False):
+        single, images = _normalize_image_input(image_or_images)
+        transform = make_eupe_transform(resolution)
+        batch = torch.stack([transform(img) for img in images]).to(self.device)
+        # Hook into intermediate ViT blocks for multi-scale features
+        intermediates = {}
+        hooks = []
+        for idx in HOOK_BLOCK_INDICES:
+            def _make_hook(block_idx):
+                def _hook(module, inp, out):
+                    intermediates[block_idx] = out[0] if isinstance(out, list) else out
+                return _hook
+            hooks.append(self.backbone.blocks[idx].register_forward_hook(_make_hook(idx)))
+        with torch.autocast(self.device.type, dtype=torch.bfloat16, enabled=self.device.type == "cuda"):
+            self.backbone.forward_features(batch)
+        for h in hooks:
+            h.remove()
+        inter_list = [intermediates[idx].float() for idx in HOOK_BLOCK_INDICES]
+        H = W = resolution // 16
+        if return_confidence:
+            depth_b, distribution, bins = self.depth_head(
+                inter_list, H, W, return_distribution=True)
+            # Std of the 256-bin depth distribution: var = E[X^2] - E[X]^2.
+            mean_sq = torch.einsum("bkhw,k->bhw", distribution, bins ** 2)
+            variance = (mean_sq - depth_b.squeeze(1) ** 2).clamp(min=0)
+            std_b = torch.sqrt(variance).unsqueeze(1)
+        else:
+            depth_b = self.depth_head(inter_list, H, W)
+            std_b = None
+        # Crop the DPT fusion border artifact (zero-padding in the conv chain
+        # produces systematically wrong edge values that compound across 4 stages)
+        crop = max(4, depth_b.shape[2] // 13)
+        depth_b = depth_b[:, :, crop:-crop, crop:-crop]
+        depth_b = F.interpolate(depth_b, size=(resolution, resolution), mode="bilinear", align_corners=False)
+        if std_b is not None:
+            std_b = std_b[:, :, crop:-crop, crop:-crop]
+            std_b = F.interpolate(std_b, size=(resolution, resolution), mode="bilinear", align_corners=False)
+        depth_squeezed = depth_b[:, 0].float()
+        if return_confidence:
+            std_squeezed = std_b[:, 0].float()
+            if single:
+                return depth_squeezed[0], std_squeezed[0]
+            return [(depth_squeezed[i], std_squeezed[i]) for i in range(len(images))]
+        if single:
+            return depth_squeezed[0]
+        return [depth_squeezed[i] for i in range(len(images))]
+    @torch.inference_mode()
+    def correspond(
+        self,
+        src_image: Image.Image,
+        tgt_image: Image.Image,
+        src_keypoints: list,
+        resolution: int = 512,
+    ):
+        sw, sh = src_image.size
+        tw, th = tgt_image.size
+        transform = make_eupe_transform(resolution)
+        src_t = transform(src_image).unsqueeze(0).to(self.device)
+        tgt_t = transform(tgt_image).unsqueeze(0).to(self.device)
+        _, src_feats = self._extract(src_t)
+        _, tgt_feats = self._extract(tgt_t)
+        src_feats = F.interpolate(src_feats, size=(resolution, resolution), mode="bilinear", align_corners=False)
+        tgt_feats = F.interpolate(tgt_feats, size=(resolution, resolution), mode="bilinear", align_corners=False)
+        src_feats = F.normalize(src_feats[0].permute(1, 2, 0), dim=-1)
+        tgt_feats = F.normalize(tgt_feats[0].permute(1, 2, 0), dim=-1)
+        preds = []
+        for kp in src_keypoints:
+            sx = min(max(int(kp[0] / sw * resolution), 0), resolution - 1)
+            sy = min(max(int(kp[1] / sh * resolution), 0), resolution - 1)
+            src_vec = src_feats[sy, sx]
+            sim_map = torch.einsum("d,hwd->hw", src_vec, tgt_feats)
+            flat = sim_map.argmax().item()
+            py, px = flat // resolution, flat % resolution
+            preds.append([px / resolution * tw, py / resolution * th])
+        return preds
+    @torch.inference_mode()
+    def detect(
+        self,
+        image_or_images,
+        resolution: int = 768,
+        score_thresh: float = 0.05,
+        nms_thresh: float = 0.5,
+        max_per_image: int = 100,
+    ):
+        single, images = _normalize_image_input(image_or_images)
+        # Letterbox each image to match the training transform (resize long side
+        # to `resolution`, pad bottom/right with black). Box coordinates are
+        # recovered after decoding by unscaling.
+        canvases, scales, orig_sizes = [], [], []
+        for img in images:
+            canvas, scale, orig = _letterbox_to_square(img, resolution)
+            canvases.append(canvas)
+            scales.append(scale)
+            orig_sizes.append(orig)
+        det_normalize = v2.Compose([
+            v2.ToImage(),
+            v2.ToDtype(torch.float32, scale=True),
+            v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
+        ])
+        batch = torch.stack([det_normalize(c) for c in canvases]).to(self.device)
+        _, spatial = self._extract(batch)
+        with torch.autocast(self.device.type, dtype=torch.bfloat16, enabled=self.device.type == "cuda"):
+            cls_logits, box_regs, centernesses = self.detection_head(spatial)
+        cls_logits = [c.float() for c in cls_logits]
+        box_regs = [b.float() for b in box_regs]
+        centernesses = [c.float() for c in centernesses]
+        feature_sizes = [(cl.shape[2], cl.shape[3]) for cl in cls_logits]
+        locations = _make_locations(feature_sizes, FPN_STRIDES, spatial.device)
+        image_sizes = [(resolution, resolution)] * len(images)
+        results = _decode_detections(
+            cls_logits, box_regs, centernesses, locations,
+            image_sizes=image_sizes,
+            score_thresh=score_thresh,
+            nms_thresh=nms_thresh,
+            max_per_image=max_per_image,
+        )
+        class_names = self.config.detection_class_names
+        formatted = []
+        for i, r in enumerate(results):
+            scale = scales[i]
+            orig_w, orig_h = orig_sizes[i]
+            boxes = r["boxes"].cpu().numpy() / scale
+            boxes[:, 0::2] = boxes[:, 0::2].clip(0, orig_w)
+            boxes[:, 1::2] = boxes[:, 1::2].clip(0, orig_h)
+            detections = []
+            for box, score, label in zip(
+                boxes, r["scores"].cpu().numpy(), r["labels"].cpu().numpy()
+            ):
+                detections.append({
+                    "box": [float(v) for v in box.tolist()],
+                    "score": float(score),
+                    "label": int(label),
+                    "class_name": class_names[int(label)] if int(label) < len(class_names) else f"class_{int(label)}",
+                })
+            formatted.append(detections)
+        return formatted[0] if single else formatted
+    def perceive(self, image_or_images, return_confidence: bool = False):
+        single, images = _normalize_image_input(image_or_images)
+        t0 = time.time()
+        classif = self.classify(images, top_k=5)
+        t1 = time.time()
+        seg_out = self.segment(images, resolution=512, return_confidence=return_confidence)
+        t2 = time.time()
+        depth_out = self.depth(images, resolution=416, return_confidence=return_confidence)
+        t3 = time.time()
+        if return_confidence:
+            seg_maps = [s for s, _ in seg_out]
+            seg_confs = [c for _, c in seg_out]
+            depth_maps = [d for d, _ in depth_out]
+            depth_uncerts = [u for _, u in depth_out]
+        else:
+            seg_maps = seg_out
+            depth_maps = depth_out
+            seg_confs = depth_uncerts = None
+        timings = {
+            "classify": (t1 - t0) * 1000,
+            "segment": (t2 - t1) * 1000,
+            "depth": (t3 - t2) * 1000,
+            "total": (t3 - t0) * 1000,
+        }
+        results = []
+        for i in range(len(images)):
+            entry = {
+                "classification": classif[i],
+                "segmentation": seg_maps[i].cpu().numpy(),
+                "depth": depth_maps[i].cpu().numpy(),
+                "timings_ms": timings,
+            }
+            if return_confidence:
+                entry["segmentation_confidence"] = seg_confs[i].cpu().numpy()
+                entry["depth_uncertainty"] = depth_uncerts[i].cpu().numpy()
+            results.append(entry)
+        return results[0] if single else results
+    def export_onnx(
+        self,
+        out_dir: str,
+        backbone_resolution: int = 224,
+        dynamic_batch: bool = True,
+        verify: bool = True,
+        tolerance: Union[float, Dict[str, float]] = 5e-2,
+        opset_version: int = 17,
+        include_nms: bool = False,
+        nms_iou_threshold: float = 0.5,
+        nms_score_threshold: float = 0.05,
+        nms_max_detections: int = 100,
+    ) -> dict:
+        """Export backbone, classifier, seg head, depth head, and detection head to ONNX.
+        Produces five graphs:
+          - argus_backbone.onnx       image[B,3,H,W]              -> cls[B,D], spatial[B,D,H/16,W/16]
+          - argus_classifier.onnx     cls_token[B,D]              -> probs[B,1000]
+          - argus_seg_head.onnx       spatial_features[B,D,h,w]   -> seg_logits[B,150,H,W]
+          - argus_depth_head.onnx     intermediate_{0..3}[B,N+5,D] -> depth_map[B,1,~8h,~8w]
+          - argus_detection_head.onnx spatial_features[B,D,h,w]   -> boxes, scores (+ labels, batch_indices if include_nms)
+        The seg graph folds bilinear upsample to input resolution into the
+        graph, so consumers argmax directly without a separate interpolation
+        step. Correspondence has no learned parameters — it runs as
+        cosine-max on the backbone's spatial output and needs no graph.
+        ``include_nms=True`` bakes an ONNX NonMaxSuppression (opset >= 10)
+        op into the detection head. The detection graph then emits four
+        post-NMS tensors (boxes [M,4], scores [M], class_labels [M],
+        batch_indices [M]) instead of the raw (boxes, scores) pair. Useful
+        for single-shot TensorRT / mobile inference. The default
+        ``include_nms=False`` leaves NMS to the consumer so they can choose
+        hard vs soft, per-class vs global, and tune thresholds without
+        re-exporting.
+        ``tolerance`` can be a float (applied uniformly to every
+        ``*_max_diff`` check) or a dict keyed by verification output name
+        (e.g. ``{"detection_boxes_max_diff": 3.2, "default": 5e-2}``). The
+        ``"default"`` key covers outputs not otherwise listed. If a float
+        is passed, detection box coordinates get a resolution-scaled
+        tolerance (``max(tolerance, backbone_resolution * 5e-3)``) because
+        exp() in the FCOS regression path amplifies FP kernel-dispatch
+        differences to pixel-scale absolute diffs.
+        """
+        import os
+        os.makedirs(out_dir, exist_ok=True)
+        if backbone_resolution % self.config.patch_size != 0:
+            raise ValueError(
+                f"backbone_resolution ({backbone_resolution}) must be a multiple of patch_size ({self.config.patch_size})"
+            )
+        spatial_resolution = backbone_resolution // self.config.patch_size
+        if backbone_resolution < 320:
+            import warnings
+            warnings.warn(
+                f"backbone_resolution={backbone_resolution} is below 320; the detection "
+                f"head's coarsest FPN level (stride 128) collapses to <=2 locations per "
+                f"side and the detection graph, while it exports and runs, cannot produce "
+                f"useful detections at this resolution. Classifier, seg, and depth graphs "
+                f"are unaffected. FCOS convention is 640-800px input; export at "
+                f">= 512 for detection.",
+                stacklevel=2,
+            )
+        wrapper = _BackboneExportWrapper(self.backbone).to(self.device).eval()
+        dummy_image = torch.randn(
+            1, 3, backbone_resolution, backbone_resolution,
+            device=self.device, dtype=torch.float32,
+        )
+        dummy_spatial = torch.randn(
+            1, self.config.embed_dim, spatial_resolution, spatial_resolution,
+            device=self.device, dtype=torch.float32,
+        )
+        backbone_path = os.path.join(out_dir, "argus_backbone.onnx")
+        classifier_path = os.path.join(out_dir, "argus_classifier.onnx")
+        seg_path = os.path.join(out_dir, "argus_seg_head.onnx")
+        depth_path = os.path.join(out_dir, "argus_depth_head.onnx")
+        detection_path = os.path.join(out_dir, "argus_detection_head.onnx")
+        backbone_axes = None
+        head_axes = None
+        if dynamic_batch:
+            backbone_axes = {
+                "image": {0: "batch"},
+                "cls_token": {0: "batch"},
+                "spatial_features": {0: "batch"},
+            }
+            head_axes = {
+                "spatial_features": {0: "batch"},
+                "seg_logits": {0: "batch"},
+                "depth_map": {0: "batch"},
+            }
+        # dynamo path crashes on EUPE's list-based forward; use legacy.
+        with torch.inference_mode():
+            torch.onnx.export(
+                wrapper, dummy_image, backbone_path,
+                input_names=["image"],
+                output_names=["cls_token", "spatial_features"],
+                dynamic_axes=backbone_axes,
+                opset_version=opset_version,
+                do_constant_folding=True,
+                dynamo=False,
+            )
+            seg_wrapper = _SegHeadExportWrapper(self.seg_head, backbone_resolution).to(self.device).eval()
+            torch.onnx.export(
+                seg_wrapper, dummy_spatial, seg_path,
+                input_names=["spatial_features"],
+                output_names=["seg_logits"],
+                dynamic_axes={"spatial_features": head_axes["spatial_features"], "seg_logits": head_axes["seg_logits"]} if head_axes else None,
+                opset_version=opset_version,
+                do_constant_folding=True,
+                dynamo=False,
+            )
+            depth_wrapper = _DepthHeadExportWrapper(
+                self.depth_head, spatial_resolution, spatial_resolution
+            ).to(self.device).eval()
+            num_patch_tokens = spatial_resolution * spatial_resolution + N_PREFIX_TOKENS
+            dummy_inter = tuple(
+                torch.randn(1, num_patch_tokens, self.config.embed_dim,
+                            device=self.device, dtype=torch.float32)
+                for _ in range(len(HOOK_BLOCK_INDICES))
+            )
+            depth_input_names = [f"intermediate_{i}" for i in range(len(HOOK_BLOCK_INDICES))]
+            if dynamic_batch:
+                depth_axes = {name: {0: "batch"} for name in depth_input_names}
+                depth_axes["depth_map"] = {0: "batch"}
+            else:
+                depth_axes = None
+            torch.onnx.export(
+                depth_wrapper, dummy_inter, depth_path,
+                input_names=depth_input_names,
+                output_names=["depth_map"],
+                dynamic_axes=depth_axes,
+                opset_version=opset_version,
+                do_constant_folding=True,
+                dynamo=False,
+            )
+            classifier_wrapper = _ClassifierExportWrapper(
+                self.class_logit_weight, self.class_logit_bias
+            ).to(self.device).eval()
+            dummy_cls = torch.randn(
+                1, self.config.embed_dim, device=self.device, dtype=torch.float32,
+            )
+            if dynamic_batch:
+                classifier_axes = {"cls_token": {0: "batch"}, "class_probs": {0: "batch"}}
+            else:
+                classifier_axes = None
+            torch.onnx.export(
+                classifier_wrapper, dummy_cls, classifier_path,
+                input_names=["cls_token"],
+                output_names=["class_probs"],
+                dynamic_axes=classifier_axes,
+                opset_version=opset_version,
+                do_constant_folding=True,
+                dynamo=False,
+            )
+            detection_wrapper = _DetectionHeadExportWrapper(
+                self.detection_head, backbone_resolution,
+                include_nms=include_nms,
+                nms_iou_threshold=nms_iou_threshold,
+                nms_score_threshold=nms_score_threshold,
+                nms_max_detections=nms_max_detections,
+            ).to(self.device).eval()
+            if include_nms:
+                detection_output_names = ["boxes", "scores", "class_labels", "batch_indices"]
+                # Post-NMS outputs are flat [M, ...]; no fixed batch axis to mark.
+                # Spatial features input still has a dynamic batch dim so the graph
+                # supports multi-image inference even with fused NMS.
+                detection_axes = {"spatial_features": {0: "batch"}} if dynamic_batch else None
+            else:
+                detection_output_names = ["boxes", "scores"]
+                if dynamic_batch:
+                    detection_axes = {
+                        "spatial_features": {0: "batch"},
+                        "boxes": {0: "batch"},
+                        "scores": {0: "batch"},
+                    }
+                else:
+                    detection_axes = None
+            torch.onnx.export(
+                detection_wrapper, dummy_spatial, detection_path,
+                input_names=["spatial_features"],
+                output_names=detection_output_names,
+                dynamic_axes=detection_axes,
+                opset_version=opset_version,
+                do_constant_folding=True,
+                dynamo=False,
+            )
+        result = {
+            "backbone": backbone_path,
+            "classifier": classifier_path,
+            "seg_head": seg_path,
+            "depth_head": depth_path,
+            "detection_head": detection_path,
+        }
+        if verify:
+            try:
+                import onnxruntime as ort
+            except ImportError as e:
+                raise ImportError("onnxruntime not installed; pip install onnxruntime") from e
+            providers = ["CPUExecutionProvider"]
+            verify_image = torch.randn(2, 3, backbone_resolution, backbone_resolution, dtype=torch.float32)
+            verify_spatial = torch.randn(2, self.config.embed_dim, spatial_resolution, spatial_resolution, dtype=torch.float32)
+            verify_cls = torch.randn(2, self.config.embed_dim, dtype=torch.float32)
+            verify_inter = [
+                torch.randn(2, num_patch_tokens, self.config.embed_dim, dtype=torch.float32)
+                for _ in range(len(HOOK_BLOCK_INDICES))
+            ]
+            with torch.inference_mode():
+                ref_cls, ref_spatial = wrapper(verify_image.to(self.device))
+                ref_seg = seg_wrapper(verify_spatial.to(self.device))
+                ref_depth = depth_wrapper(*[v.to(self.device) for v in verify_inter])
+                ref_probs = classifier_wrapper(verify_cls.to(self.device))
+                ref_det = detection_wrapper(verify_spatial.to(self.device))
+            sess = ort.InferenceSession(backbone_path, providers=providers)
+            ort_cls, ort_spatial = sess.run(None, {"image": verify_image.numpy()})
+            cls_diff = float(np.abs(ort_cls - ref_cls.cpu().numpy()).max())
+            spatial_diff = float(np.abs(ort_spatial - ref_spatial.cpu().numpy()).max())
+            sess = ort.InferenceSession(seg_path, providers=providers)
+            ort_seg = sess.run(None, {"spatial_features": verify_spatial.numpy()})[0]
+            seg_diff = float(np.abs(ort_seg - ref_seg.cpu().numpy()).max())
+            sess = ort.InferenceSession(depth_path, providers=providers)
+            ort_depth = sess.run(None, {f"intermediate_{i}": verify_inter[i].numpy()
+                                        for i in range(len(HOOK_BLOCK_INDICES))})[0]
+            depth_diff = float(np.abs(ort_depth - ref_depth.cpu().numpy()).max())
+            sess = ort.InferenceSession(classifier_path, providers=providers)
+            ort_probs = sess.run(None, {"cls_token": verify_cls.numpy()})[0]
+            classifier_diff = float(np.abs(ort_probs - ref_probs.cpu().numpy()).max())
+            sess = ort.InferenceSession(detection_path, providers=providers)
+            ort_det = sess.run(None, {"spatial_features": verify_spatial.numpy()})
+            verification = {
+                "backbone_cls_max_diff": cls_diff,
+                "backbone_spatial_max_diff": spatial_diff,
+                "classifier_max_diff": classifier_diff,
+                "seg_head_max_diff": seg_diff,
+                "depth_head_max_diff": depth_diff,
+                "verified_batch_size": 2,
+            }
+            if include_nms:
+                # NMS is inherently implementation-dependent: ONNX's
+                # NonMaxSuppression and the torchvision eager fallback differ
+                # on tie-breaking when multiple detections share a score or
+                # when near-threshold boxes are right at the score cutoff.
+                # Element-wise comparison of post-NMS outputs is the wrong
+                # metric. The structural checks below verify the graph runs,
+                # returns reasonable shapes, and agrees on the top detection.
+                pt_boxes, pt_scores, pt_labels, _ = ref_det
+                ort_boxes, ort_scores, ort_labels, _ = ort_det
+                pt_n = int(pt_scores.shape[0])
+                ort_n = int(ort_scores.shape[0])
+                verification["detection_nms_ref_count"] = pt_n
+                verification["detection_nms_ort_count"] = ort_n
+                if pt_n > 0 and ort_n > 0:
+                    pt_top = int(pt_scores.cpu().numpy().argmax())
+                    ort_top = int(ort_scores.argmax())
+                    pt_top_box = pt_boxes[pt_top].cpu().numpy()
+                    ort_top_box = ort_boxes[ort_top]
+                    # IoU of the two top boxes
+                    x1 = max(pt_top_box[0], ort_top_box[0])
+                    y1 = max(pt_top_box[1], ort_top_box[1])
+                    x2 = min(pt_top_box[2], ort_top_box[2])
+                    y2 = min(pt_top_box[3], ort_top_box[3])
+                    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
+                    pt_area = max(0.0, pt_top_box[2] - pt_top_box[0]) * max(0.0, pt_top_box[3] - pt_top_box[1])
+                    ort_area = max(0.0, ort_top_box[2] - ort_top_box[0]) * max(0.0, ort_top_box[3] - ort_top_box[1])
+                    union = max(1e-6, pt_area + ort_area - inter)
+                    verification["detection_nms_top_iou"] = float(inter / union)
+                    verification["detection_nms_top_class_match"] = bool(
+                        int(pt_labels[pt_top].cpu()) == int(ort_labels[ort_top])
+                    )
+                    verification["detection_nms_top_score_diff"] = float(abs(
+                        float(pt_scores[pt_top].cpu()) - float(ort_scores[ort_top])
+                    ))
+                else:
+                    verification["detection_nms_top_iou"] = None
+                    verification["detection_nms_top_class_match"] = None
+                    verification["detection_nms_top_score_diff"] = None
+            else:
+                ort_boxes, ort_scores = ort_det
+                ref_boxes, ref_scores = ref_det
+                verification["detection_boxes_max_diff"] = float(
+                    np.abs(ort_boxes - ref_boxes.cpu().numpy()).max())
+                verification["detection_scores_max_diff"] = float(
+                    np.abs(ort_scores - ref_scores.cpu().numpy()).max())
+            # Tolerance resolution: either a float applied uniformly, or a dict
+            # keyed by verification output name (with optional "default" key).
+            # Detection boxes get a resolution-scaled tolerance when only a
+            # float is supplied — exp() in the FCOS regression path amplifies
+            # FP kernel-dispatch differences to pixel-scale absolute diffs.
+            if isinstance(tolerance, dict):
+                default_tol = float(tolerance.get("default", 5e-2))
+                def _tol_for(key):
+                    return float(tolerance.get(key, default_tol))
+                verification["tolerance"] = dict(tolerance)
+            else:
+                base = float(tolerance)
+                box_tol = max(base, backbone_resolution * 5e-3)
+                def _tol_for(key):
+                    return box_tol if key == "detection_boxes_max_diff" else base
+                verification["tolerance"] = base
+                verification["detection_boxes_tolerance"] = box_tol
+            for key, val in list(verification.items()):
+                if not key.endswith("_max_diff"):
+                    continue
+                t = _tol_for(key)
+                if val > t:
+                    raise RuntimeError(
+                        f"ONNX/PyTorch divergence in {key}: {val:.2e} > tolerance {t:.2e}"
+                    )
+            result["verification"] = verification
+        return result

config.json ADDED Viewed

	@@ -0,0 +1,2029 @@

+{
+  "architectures": [
+    "Argus"
+  ],
+  "auto_map": {
+    "AutoConfig": "argus.ArgusConfig",
+    "AutoModel": "argus.Argus"
+  },
+  "model_type": "argus",
+  "embed_dim": 768,
+  "patch_size": 16,
+  "num_seg_classes": 150,
+  "depth_n_bins": 256,
+  "depth_min_depth": 0.001,
+  "depth_max_depth": 10.0,
+  "num_imagenet_classes": 1000,
+  "class_ids": [
+    "n01440764",
+    "n01443537",
+    "n01484850",
+    "n01491361",
+    "n01494475",
+    "n01496331",
+    "n01498041",
+    "n01514668",
+    "n01514859",
+    "n01518878",
+    "n01530575",
+    "n01531178",
+    "n01532829",
+    "n01534433",
+    "n01537544",
+    "n01558993",
+    "n01560419",
+    "n01580077",
+    "n01582220",
+    "n01592084",
+    "n01601694",
+    "n01608432",
+    "n01614925",
+    "n01616318",
+    "n01622779",
+    "n01629819",
+    "n01630670",
+    "n01631663",
+    "n01632458",
+    "n01632777",
+    "n01641577",
+    "n01644373",
+    "n01644900",
+    "n01664065",
+    "n01665541",
+    "n01667114",
+    "n01667778",
+    "n01669191",
+    "n01675722",
+    "n01677366",
+    "n01682714",
+    "n01685808",
+    "n01687978",
+    "n01688243",
+    "n01689811",
+    "n01692333",
+    "n01693334",
+    "n01694178",
+    "n01695060",
+    "n01697457",
+    "n01698640",
+    "n01704323",
+    "n01728572",
+    "n01728920",
+    "n01729322",
+    "n01729977",
+    "n01734418",
+    "n01735189",
+    "n01737021",
+    "n01739381",
+    "n01740131",
+    "n01742172",
+    "n01744401",
+    "n01748264",
+    "n01749939",
+    "n01751748",
+    "n01753488",
+    "n01755581",
+    "n01756291",
+    "n01768244",
+    "n01770081",
+    "n01770393",
+    "n01773157",
+    "n01773549",
+    "n01773797",
+    "n01774384",
+    "n01774750",
+    "n01775062",
+    "n01776313",
+    "n01784675",
+    "n01795545",
+    "n01796340",
+    "n01797886",
+    "n01798484",
+    "n01806143",
+    "n01806567",
+    "n01807496",
+    "n01817953",
+    "n01818515",
+    "n01819313",
+    "n01820546",
+    "n01824575",
+    "n01828970",
+    "n01829413",
+    "n01833805",
+    "n01843065",
+    "n01843383",
+    "n01847000",
+    "n01855032",
+    "n01855672",
+    "n01860187",
+    "n01871265",
+    "n01872401",
+    "n01873310",
+    "n01877812",
+    "n01882714",
+    "n01883070",
+    "n01910747",
+    "n01914609",
+    "n01917289",
+    "n01924916",
+    "n01930112",
+    "n01943899",
+    "n01944390",
+    "n01945685",
+    "n01950731",
+    "n01955084",
+    "n01968897",
+    "n01978287",
+    "n01978455",
+    "n01980166",
+    "n01981276",
+    "n01983481",
+    "n01984695",
+    "n01985128",
+    "n01986214",
+    "n01990800",
+    "n02002556",
+    "n02002724",
+    "n02006656",
+    "n02007558",
+    "n02009229",
+    "n02009912",
+    "n02011460",
+    "n02012849",
+    "n02013706",
+    "n02017213",
+    "n02018207",
+    "n02018795",
+    "n02025239",
+    "n02027492",
+    "n02028035",
+    "n02033041",
+    "n02037110",
+    "n02051845",
+    "n02056570",
+    "n02058221",
+    "n02066245",
+    "n02071294",
+    "n02074367",
+    "n02077923",
+    "n02085620",
+    "n02085782",
+    "n02085936",
+    "n02086079",
+    "n02086240",
+    "n02086646",
+    "n02086910",
+    "n02087046",
+    "n02087394",
+    "n02088094",
+    "n02088238",
+    "n02088364",
+    "n02088466",
+    "n02088632",
+    "n02089078",
+    "n02089867",
+    "n02089973",
+    "n02090379",
+    "n02090622",
+    "n02090721",
+    "n02091032",
+    "n02091134",
+    "n02091244",
+    "n02091467",
+    "n02091635",
+    "n02091831",
+    "n02092002",
+    "n02092339",
+    "n02093256",
+    "n02093428",
+    "n02093647",
+    "n02093754",
+    "n02093859",
+    "n02093991",
+    "n02094114",
+    "n02094258",
+    "n02094433",
+    "n02095314",
+    "n02095570",
+    "n02095889",
+    "n02096051",
+    "n02096177",
+    "n02096294",
+    "n02096437",
+    "n02096585",
+    "n02097047",
+    "n02097130",
+    "n02097209",
+    "n02097298",
+    "n02097474",
+    "n02097658",
+    "n02098105",
+    "n02098286",
+    "n02098413",
+    "n02099267",
+    "n02099429",
+    "n02099601",
+    "n02099712",
+    "n02099849",
+    "n02100236",
+    "n02100583",
+    "n02100735",
+    "n02100877",
+    "n02101006",
+    "n02101388",
+    "n02101556",
+    "n02102040",
+    "n02102177",
+    "n02102318",
+    "n02102480",
+    "n02102973",
+    "n02104029",
+    "n02104365",
+    "n02105056",
+    "n02105162",
+    "n02105251",
+    "n02105412",
+    "n02105505",
+    "n02105641",
+    "n02105855",
+    "n02106030",
+    "n02106166",
+    "n02106382",
+    "n02106550",
+    "n02106662",
+    "n02107142",
+    "n02107312",
+    "n02107574",
+    "n02107683",
+    "n02107908",
+    "n02108000",
+    "n02108089",
+    "n02108422",
+    "n02108551",
+    "n02108915",
+    "n02109047",
+    "n02109525",
+    "n02109961",
+    "n02110063",
+    "n02110185",
+    "n02110341",
+    "n02110627",
+    "n02110806",
+    "n02110958",
+    "n02111129",
+    "n02111277",
+    "n02111500",
+    "n02111889",
+    "n02112018",
+    "n02112137",
+    "n02112350",
+    "n02112706",
+    "n02113023",
+    "n02113186",
+    "n02113624",
+    "n02113712",
+    "n02113799",
+    "n02113978",
+    "n02114367",
+    "n02114548",
+    "n02114712",
+    "n02114855",
+    "n02115641",
+    "n02115913",
+    "n02116738",
+    "n02117135",
+    "n02119022",
+    "n02119789",
+    "n02120079",
+    "n02120505",
+    "n02123045",
+    "n02123159",
+    "n02123394",
+    "n02123597",
+    "n02124075",
+    "n02125311",
+    "n02127052",
+    "n02128385",
+    "n02128757",
+    "n02128925",
+    "n02129165",
+    "n02129604",
+    "n02130308",
+    "n02132136",
+    "n02133161",
+    "n02134084",
+    "n02134418",
+    "n02137549",
+    "n02138441",
+    "n02165105",
+    "n02165456",
+    "n02167151",
+    "n02168699",
+    "n02169497",
+    "n02172182",
+    "n02174001",
+    "n02177972",
+    "n02190166",
+    "n02206856",
+    "n02219486",
+    "n02226429",
+    "n02229544",
+    "n02231487",
+    "n02233338",
+    "n02236044",
+    "n02256656",
+    "n02259212",
+    "n02264363",
+    "n02268443",
+    "n02268853",
+    "n02276258",
+    "n02277742",
+    "n02279972",
+    "n02280649",
+    "n02281406",
+    "n02281787",
+    "n02317335",
+    "n02319095",
+    "n02321529",
+    "n02325366",
+    "n02326432",
+    "n02328150",
+    "n02342885",
+    "n02346627",
+    "n02356798",
+    "n02361337",
+    "n02363005",
+    "n02364673",
+    "n02389026",
+    "n02391049",
+    "n02395406",
+    "n02396427",
+    "n02397096",
+    "n02398521",
+    "n02403003",
+    "n02408429",
+    "n02410509",
+    "n02412080",
+    "n02415577",
+    "n02417914",
+    "n02422106",
+    "n02422699",
+    "n02423022",
+    "n02437312",
+    "n02437616",
+    "n02441942",
+    "n02442845",
+    "n02443114",
+    "n02443484",
+    "n02444819",
+    "n02445715",
+    "n02447366",
+    "n02454379",
+    "n02457408",
+    "n02480495",
+    "n02480855",
+    "n02481823",
+    "n02483362",
+    "n02483708",
+    "n02484975",
+    "n02486261",
+    "n02486410",
+    "n02487347",
+    "n02488291",
+    "n02488702",
+    "n02489166",
+    "n02490219",
+    "n02492035",
+    "n02492660",
+    "n02493509",
+    "n02493793",
+    "n02494079",
+    "n02497673",
+    "n02500267",
+    "n02504013",
+    "n02504458",
+    "n02509815",
+    "n02510455",
+    "n02514041",
+    "n02526121",
+    "n02536864",
+    "n02606052",
+    "n02607072",
+    "n02640242",
+    "n02641379",
+    "n02643566",
+    "n02655020",
+    "n02666196",
+    "n02667093",
+    "n02669723",
+    "n02672831",
+    "n02676566",
+    "n02687172",
+    "n02690373",
+    "n02692877",
+    "n02699494",
+    "n02701002",
+    "n02704792",
+    "n02708093",
+    "n02727426",
+    "n02730930",
+    "n02747177",
+    "n02749479",
+    "n02769748",
+    "n02776631",
+    "n02777292",
+    "n02782093",
+    "n02783161",
+    "n02786058",
+    "n02787622",
+    "n02788148",
+    "n02790996",
+    "n02791124",
+    "n02791270",
+    "n02793495",
+    "n02794156",
+    "n02795169",
+    "n02797295",
+    "n02799071",
+    "n02802426",
+    "n02804414",
+    "n02804610",
+    "n02807133",
+    "n02808304",
+    "n02808440",
+    "n02814533",
+    "n02814860",
+    "n02815834",
+    "n02817516",
+    "n02823428",
+    "n02823750",
+    "n02825657",
+    "n02834397",
+    "n02835271",
+    "n02837789",
+    "n02840245",
+    "n02841315",
+    "n02843684",
+    "n02859443",
+    "n02860847",
+    "n02865351",
+    "n02869837",
+    "n02870880",
+    "n02871525",
+    "n02877765",
+    "n02879718",
+    "n02883205",
+    "n02892201",
+    "n02892767",
+    "n02894605",
+    "n02895154",
+    "n02906734",
+    "n02909870",
+    "n02910353",
+    "n02916936",
+    "n02917067",
+    "n02927161",
+    "n02930766",
+    "n02939185",
+    "n02948072",
+    "n02950826",
+    "n02951358",
+    "n02951585",
+    "n02963159",
+    "n02965783",
+    "n02966193",
+    "n02966687",
+    "n02971356",
+    "n02974003",
+    "n02977058",
+    "n02978881",
+    "n02979186",
+    "n02980441",
+    "n02981792",
+    "n02988304",
+    "n02992211",
+    "n02992529",
+    "n02999410",
+    "n03000134",
+    "n03000247",
+    "n03000684",
+    "n03014705",
+    "n03016953",
+    "n03017168",
+    "n03018349",
+    "n03026506",
+    "n03028079",
+    "n03032252",
+    "n03041632",
+    "n03042490",
+    "n03045698",
+    "n03047690",
+    "n03062245",
+    "n03063599",
+    "n03063689",
+    "n03065424",
+    "n03075370",
+    "n03085013",
+    "n03089624",
+    "n03095699",
+    "n03100240",
+    "n03109150",
+    "n03110669",
+    "n03124043",
+    "n03124170",
+    "n03125729",
+    "n03126707",
+    "n03127747",
+    "n03127925",
+    "n03131574",
+    "n03133878",
+    "n03134739",
+    "n03141823",
+    "n03146219",
+    "n03160309",
+    "n03179701",
+    "n03180011",
+    "n03187595",
+    "n03188531",
+    "n03196217",
+    "n03197337",
+    "n03201208",
+    "n03207743",
+    "n03207941",
+    "n03208938",
+    "n03216828",
+    "n03218198",
+    "n03220513",
+    "n03223299",
+    "n03240683",
+    "n03249569",
+    "n03250847",
+    "n03255030",
+    "n03259280",
+    "n03271574",
+    "n03272010",
+    "n03272562",
+    "n03290653",
+    "n03291819",
+    "n03297495",
+    "n03314780",
+    "n03325584",
+    "n03337140",
+    "n03344393",
+    "n03345487",
+    "n03347037",
+    "n03355925",
+    "n03372029",
+    "n03376595",
+    "n03379051",
+    "n03384352",
+    "n03388043",
+    "n03388183",
+    "n03388549",
+    "n03393912",
+    "n03394916",
+    "n03400231",
+    "n03404251",
+    "n03417042",
+    "n03424325",
+    "n03425413",
+    "n03443371",
+    "n03444034",
+    "n03445777",
+    "n03445924",
+    "n03447447",
+    "n03447721",
+    "n03450230",
+    "n03452741",
+    "n03457902",
+    "n03459775",
+    "n03461385",
+    "n03467068",
+    "n03476684",
+    "n03476991",
+    "n03478589",
+    "n03481172",
+    "n03482405",
+    "n03483316",
+    "n03485407",
+    "n03485794",
+    "n03492542",
+    "n03494278",
+    "n03495258",
+    "n03496892",
+    "n03498962",
+    "n03527444",
+    "n03529860",
+    "n03530642",
+    "n03532672",
+    "n03534580",
+    "n03535780",
+    "n03538406",
+    "n03544143",
+    "n03584254",
+    "n03584829",
+    "n03590841",
+    "n03594734",
+    "n03594945",
+    "n03595614",
+    "n03598930",
+    "n03599486",
+    "n03602883",
+    "n03617480",
+    "n03623198",
+    "n03627232",
+    "n03630383",
+    "n03633091",
+    "n03637318",
+    "n03642806",
+    "n03649909",
+    "n03657121",
+    "n03658185",
+    "n03661043",
+    "n03662601",
+    "n03666591",
+    "n03670208",
+    "n03673027",
+    "n03676483",
+    "n03680355",
+    "n03690938",
+    "n03691459",
+    "n03692522",
+    "n03697007",
+    "n03706229",
+    "n03709823",
+    "n03710193",
+    "n03710637",
+    "n03710721",
+    "n03717622",
+    "n03720891",
+    "n03721384",
+    "n03724870",
+    "n03729826",
+    "n03733131",
+    "n03733281",
+    "n03733805",
+    "n03742115",
+    "n03743016",
+    "n03759954",
+    "n03761084",
+    "n03763968",
+    "n03764736",
+    "n03769881",
+    "n03770439",
+    "n03770679",
+    "n03773504",
+    "n03775071",
+    "n03775546",
+    "n03776460",
+    "n03777568",
+    "n03777754",
+    "n03781244",
+    "n03782006",
+    "n03785016",
+    "n03786901",
+    "n03787032",
+    "n03788195",
+    "n03788365",
+    "n03791053",
+    "n03792782",
+    "n03792972",
+    "n03793489",
+    "n03794056",
+    "n03796401",
+    "n03803284",
+    "n03804744",
+    "n03814639",
+    "n03814906",
+    "n03825788",
+    "n03832673",
+    "n03837869",
+    "n03838899",
+    "n03840681",
+    "n03841143",
+    "n03843555",
+    "n03854065",
+    "n03857828",
+    "n03866082",
+    "n03868242",
+    "n03868863",
+    "n03871628",
+    "n03873416",
+    "n03874293",
+    "n03874599",
+    "n03876231",
+    "n03877472",
+    "n03877845",
+    "n03884397",
+    "n03887697",
+    "n03888257",
+    "n03888605",
+    "n03891251",
+    "n03891332",
+    "n03895866",
+    "n03899768",
+    "n03902125",
+    "n03903868",
+    "n03908618",
+    "n03908714",
+    "n03916031",
+    "n03920288",
+    "n03924679",
+    "n03929660",
+    "n03929855",
+    "n03930313",
+    "n03930630",
+    "n03933933",
+    "n03935335",
+    "n03937543",
+    "n03938244",
+    "n03942813",
+    "n03944341",
+    "n03947888",
+    "n03950228",
+    "n03954731",
+    "n03956157",
+    "n03958227",
+    "n03961711",
+    "n03967562",
+    "n03970156",
+    "n03976467",
+    "n03976657",
+    "n03977966",
+    "n03980874",
+    "n03982430",
+    "n03983396",
+    "n03991062",
+    "n03992509",
+    "n03995372",
+    "n03998194",
+    "n04004767",
+    "n04005630",
+    "n04008634",
+    "n04009552",
+    "n04019541",
+    "n04023962",
+    "n04026417",
+    "n04033901",
+    "n04033995",
+    "n04037443",
+    "n04039381",
+    "n04040759",
+    "n04041544",
+    "n04044716",
+    "n04049303",
+    "n04065272",
+    "n04067472",
+    "n04069434",
+    "n04070727",
+    "n04074963",
+    "n04081281",
+    "n04086273",
+    "n04090263",
+    "n04099969",
+    "n04111531",
+    "n04116512",
+    "n04118538",
+    "n04118776",
+    "n04120489",
+    "n04125021",
+    "n04127249",
+    "n04131690",
+    "n04133789",
+    "n04136333",
+    "n04141076",
+    "n04141327",
+    "n04141975",
+    "n04146614",
+    "n04147183",
+    "n04149813",
+    "n04152593",
+    "n04153751",
+    "n04154565",
+    "n04162706",
+    "n04179913",
+    "n04192698",
+    "n04200800",
+    "n04201297",
+    "n04204238",
+    "n04204347",
+    "n04208210",
+    "n04209133",
+    "n04209239",
+    "n04228054",
+    "n04229816",
+    "n04235860",
+    "n04238763",
+    "n04239074",
+    "n04243546",
+    "n04251144",
+    "n04252077",
+    "n04252225",
+    "n04254120",
+    "n04254680",
+    "n04254777",
+    "n04258138",
+    "n04259630",
+    "n04263257",
+    "n04264628",
+    "n04265275",
+    "n04266014",
+    "n04270147",
+    "n04273569",
+    "n04275548",
+    "n04277352",
+    "n04285008",
+    "n04286575",
+    "n04296562",
+    "n04310018",
+    "n04311004",
+    "n04311174",
+    "n04317175",
+    "n04325704",
+    "n04326547",
+    "n04328186",
+    "n04330267",
+    "n04332243",
+    "n04335435",
+    "n04336792",
+    "n04344873",
+    "n04346328",
+    "n04347754",
+    "n04350905",
+    "n04355338",
+    "n04355933",
+    "n04356056",
+    "n04357314",
+    "n04366367",
+    "n04367480",
+    "n04370456",
+    "n04371430",
+    "n04371774",
+    "n04372370",
+    "n04376876",
+    "n04380533",
+    "n04389033",
+    "n04392985",
+    "n04398044",
+    "n04399382",
+    "n04404412",
+    "n04409515",
+    "n04417672",
+    "n04418357",
+    "n04423845",
+    "n04428191",
+    "n04429376",
+    "n04435653",
+    "n04442312",
+    "n04443257",
+    "n04447861",
+    "n04456115",
+    "n04458633",
+    "n04461696",
+    "n04462240",
+    "n04465501",
+    "n04467665",
+    "n04476259",
+    "n04479046",
+    "n04482393",
+    "n04483307",
+    "n04485082",
+    "n04486054",
+    "n04487081",
+    "n04487394",
+    "n04493381",
+    "n04501370",
+    "n04505470",
+    "n04507155",
+    "n04509417",
+    "n04515003",
+    "n04517823",
+    "n04522168",
+    "n04523525",
+    "n04525038",
+    "n04525305",
+    "n04532106",
+    "n04532670",
+    "n04536866",
+    "n04540053",
+    "n04542943",
+    "n04548280",
+    "n04548362",
+    "n04550184",
+    "n04552348",
+    "n04553703",
+    "n04554684",
+    "n04557648",
+    "n04560804",
+    "n04562935",
+    "n04579145",
+    "n04579432",
+    "n04584207",
+    "n04589890",
+    "n04590129",
+    "n04591157",
+    "n04591713",
+    "n04592741",
+    "n04596742",
+    "n04597913",
+    "n04599235",
+    "n04604644",
+    "n04606251",
+    "n04612504",
+    "n04613696",
+    "n06359193",
+    "n06596364",
+    "n06785654",
+    "n06794110",
+    "n06874185",
+    "n07248320",
+    "n07565083",
+    "n07579787",
+    "n07583066",
+    "n07584110",
+    "n07590611",
+    "n07613480",
+    "n07614500",
+    "n07615774",
+    "n07684084",
+    "n07693725",
+    "n07695742",
+    "n07697313",
+    "n07697537",
+    "n07711569",
+    "n07714571",
+    "n07714990",
+    "n07715103",
+    "n07716358",
+    "n07716906",
+    "n07717410",
+    "n07717556",
+    "n07718472",
+    "n07718747",
+    "n07720875",
+    "n07730033",
+    "n07734744",
+    "n07742313",
+    "n07745940",
+    "n07747607",
+    "n07749582",
+    "n07753113",
+    "n07753275",
+    "n07753592",
+    "n07754684",
+    "n07760859",
+    "n07768694",
+    "n07802026",
+    "n07831146",
+    "n07836838",
+    "n07860988",
+    "n07871810",
+    "n07873807",
+    "n07875152",
+    "n07880968",
+    "n07892512",
+    "n07920052",
+    "n07930864",
+    "n07932039",
+    "n09193705",
+    "n09229709",
+    "n09246464",
+    "n09256479",
+    "n09288635",
+    "n09332890",
+    "n09399592",
+    "n09421951",
+    "n09428293",
+    "n09468604",
+    "n09472597",
+    "n09835506",
+    "n10148035",
+    "n10565667",
+    "n11879895",
+    "n11939491",
+    "n12057211",
+    "n12144580",
+    "n12267677",
+    "n12620546",
+    "n12768682",
+    "n12985857",
+    "n12998815",
+    "n13037406",
+    "n13040303",
+    "n13044778",
+    "n13052670",
+    "n13054560",
+    "n13133613",
+    "n15075141"
+  ],
+  "class_names": [
+    "tench, Tinca tinca",
+    "goldfish, Carassius auratus",
+    "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias",
+    "tiger shark, Galeocerdo cuvieri",
+    "hammerhead, hammerhead shark",
+    "electric ray, crampfish, numbfish, torpedo",
+    "stingray",
+    "cock",
+    "hen",
+    "ostrich, Struthio camelus",
+    "brambling, Fringilla montifringilla",
+    "goldfinch, Carduelis carduelis",
+    "house finch, linnet, Carpodacus mexicanus",
+    "junco, snowbird",
+    "indigo bunting, indigo finch, indigo bird, Passerina cyanea",
+    "robin, American robin, Turdus migratorius",
+    "bulbul",
+    "jay",
+    "magpie",
+    "chickadee",
+    "water ouzel, dipper",
+    "kite",
+    "bald eagle, American eagle, Haliaeetus leucocephalus",
+    "vulture",
+    "great grey owl, great gray owl, Strix nebulosa",
+    "European fire salamander, Salamandra salamandra",
+    "common newt, Triturus vulgaris",
+    "eft",
+    "spotted salamander, Ambystoma maculatum",
+    "axolotl, mud puppy, Ambystoma mexicanum",
+    "bullfrog, Rana catesbeiana",
+    "tree frog, tree-frog",
+    "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui",
+    "loggerhead, loggerhead turtle, Caretta caretta",
+    "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea",
+    "mud turtle",
+    "terrapin",
+    "box turtle, box tortoise",
+    "banded gecko",
+    "common iguana, iguana, Iguana iguana",
+    "American chameleon, anole, Anolis carolinensis",
+    "whiptail, whiptail lizard",
+    "agama",
+    "frilled lizard, Chlamydosaurus kingi",
+    "alligator lizard",
+    "Gila monster, Heloderma suspectum",
+    "green lizard, Lacerta viridis",
+    "African chameleon, Chamaeleo chamaeleon",
+    "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis",
+    "African crocodile, Nile crocodile, Crocodylus niloticus",
+    "American alligator, Alligator mississipiensis",
+    "triceratops",
+    "thunder snake, worm snake, Carphophis amoenus",
+    "ringneck snake, ring-necked snake, ring snake",
+    "hognose snake, puff adder, sand viper",
+    "green snake, grass snake",
+    "king snake, kingsnake",
+    "garter snake, grass snake",
+    "water snake",
+    "vine snake",
+    "night snake, Hypsiglena torquata",
+    "boa constrictor, Constrictor constrictor",
+    "rock python, rock snake, Python sebae",
+    "Indian cobra, Naja naja",
+    "green mamba",
+    "sea snake",
+    "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus",
+    "diamondback, diamondback rattlesnake, Crotalus adamanteus",
+    "sidewinder, horned rattlesnake, Crotalus cerastes",
+    "trilobite",
+    "harvestman, daddy longlegs, Phalangium opilio",
+    "scorpion",
+    "black and gold garden spider, Argiope aurantia",
+    "barn spider, Araneus cavaticus",
+    "garden spider, Aranea diademata",
+    "black widow, Latrodectus mactans",
+    "tarantula",
+    "wolf spider, hunting spider",
+    "tick",
+    "centipede",
+    "black grouse",
+    "ptarmigan",
+    "ruffed grouse, partridge, Bonasa umbellus",
+    "prairie chicken, prairie grouse, prairie fowl",
+    "peacock",
+    "quail",
+    "partridge",
+    "African grey, African gray, Psittacus erithacus",
+    "macaw",
+    "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita",
+    "lorikeet",
+    "coucal",
+    "bee eater",
+    "hornbill",
+    "hummingbird",
+    "jacamar",
+    "toucan",
+    "drake",
+    "red-breasted merganser, Mergus serrator",
+    "goose",
+    "black swan, Cygnus atratus",
+    "tusker",
+    "echidna, spiny anteater, anteater",
+    "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus",
+    "wallaby, brush kangaroo",
+    "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus",
+    "wombat",
+    "jellyfish",
+    "sea anemone, anemone",
+    "brain coral",
+    "flatworm, platyhelminth",
+    "nematode, nematode worm, roundworm",
+    "conch",
+    "snail",
+    "slug",
+    "sea slug, nudibranch",
+    "chiton, coat-of-mail shell, sea cradle, polyplacophore",
+    "chambered nautilus, pearly nautilus, nautilus",
+    "Dungeness crab, Cancer magister",
+    "rock crab, Cancer irroratus",
+    "fiddler crab",
+    "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica",
+    "American lobster, Northern lobster, Maine lobster, Homarus americanus",
+    "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish",
+    "crayfish, crawfish, crawdad, crawdaddy",
+    "hermit crab",
+    "isopod",
+    "white stork, Ciconia ciconia",
+    "black stork, Ciconia nigra",
+    "spoonbill",
+    "flamingo",
+    "little blue heron, Egretta caerulea",
+    "American egret, great white heron, Egretta albus",
+    "bittern",
+    "crane",
+    "limpkin, Aramus pictus",
+    "European gallinule, Porphyrio porphyrio",
+    "American coot, marsh hen, mud hen, water hen, Fulica americana",
+    "bustard",
+    "ruddy turnstone, Arenaria interpres",
+    "red-backed sandpiper, dunlin, Erolia alpina",
+    "redshank, Tringa totanus",
+    "dowitcher",
+    "oystercatcher, oyster catcher",
+    "pelican",
+    "king penguin, Aptenodytes patagonica",
+    "albatross, mollymawk",
+    "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus",
+    "killer whale, killer, orca, grampus, sea wolf, Orcinus orca",
+    "dugong, Dugong dugon",
+    "sea lion",
+    "Chihuahua",
+    "Japanese spaniel",
+    "Maltese dog, Maltese terrier, Maltese",
+    "Pekinese, Pekingese, Peke",
+    "Shih-Tzu",
+    "Blenheim spaniel",
+    "papillon",
+    "toy terrier",
+    "Rhodesian ridgeback",
+    "Afghan hound, Afghan",
+    "basset, basset hound",
+    "beagle",
+    "bloodhound, sleuthhound",
+    "bluetick",
+    "black-and-tan coonhound",
+    "Walker hound, Walker foxhound",
+    "English foxhound",
+    "redbone",
+    "borzoi, Russian wolfhound",
+    "Irish wolfhound",
+    "Italian greyhound",
+    "whippet",
+    "Ibizan hound, Ibizan Podenco",
+    "Norwegian elkhound, elkhound",
+    "otterhound, otter hound",
+    "Saluki, gazelle hound",
+    "Scottish deerhound, deerhound",
+    "Weimaraner",
+    "Staffordshire bullterrier, Staffordshire bull terrier",
+    "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier",
+    "Bedlington terrier",
+    "Border terrier",
+    "Kerry blue terrier",
+    "Irish terrier",
+    "Norfolk terrier",
+    "Norwich terrier",
+    "Yorkshire terrier",
+    "wire-haired fox terrier",
+    "Lakeland terrier",
+    "Sealyham terrier, Sealyham",
+    "Airedale, Airedale terrier",
+    "cairn, cairn terrier",
+    "Australian terrier",
+    "Dandie Dinmont, Dandie Dinmont terrier",
+    "Boston bull, Boston terrier",
+    "miniature schnauzer",
+    "giant schnauzer",
+    "standard schnauzer",
+    "Scotch terrier, Scottish terrier, Scottie",
+    "Tibetan terrier, chrysanthemum dog",
+    "silky terrier, Sydney silky",
+    "soft-coated wheaten terrier",
+    "West Highland white terrier",
+    "Lhasa, Lhasa apso",
+    "flat-coated retriever",
+    "curly-coated retriever",
+    "golden retriever",
+    "Labrador retriever",
+    "Chesapeake Bay retriever",
+    "German short-haired pointer",
+    "vizsla, Hungarian pointer",
+    "English setter",
+    "Irish setter, red setter",
+    "Gordon setter",
+    "Brittany spaniel",
+    "clumber, clumber spaniel",
+    "English springer, English springer spaniel",
+    "Welsh springer spaniel",
+    "cocker spaniel, English cocker spaniel, cocker",
+    "Sussex spaniel",
+    "Irish water spaniel",
+    "kuvasz",
+    "schipperke",
+    "groenendael",
+    "malinois",
+    "briard",
+    "kelpie",
+    "komondor",
+    "Old English sheepdog, bobtail",
+    "Shetland sheepdog, Shetland sheep dog, Shetland",
+    "collie",
+    "Border collie",
+    "Bouvier des Flandres, Bouviers des Flandres",
+    "Rottweiler",
+    "German shepherd, German shepherd dog, German police dog, alsatian",
+    "Doberman, Doberman pinscher",
+    "miniature pinscher",
+    "Greater Swiss Mountain dog",
+    "Bernese mountain dog",
+    "Appenzeller",
+    "EntleBucher",
+    "boxer",
+    "bull mastiff",
+    "Tibetan mastiff",
+    "French bulldog",
+    "Great Dane",
+    "Saint Bernard, St Bernard",
+    "Eskimo dog, husky",
+    "malamute, malemute, Alaskan malamute",
+    "Siberian husky",
+    "dalmatian, coach dog, carriage dog",
+    "affenpinscher, monkey pinscher, monkey dog",
+    "basenji",
+    "pug, pug-dog",
+    "Leonberg",
+    "Newfoundland, Newfoundland dog",
+    "Great Pyrenees",
+    "Samoyed, Samoyede",
+    "Pomeranian",
+    "chow, chow chow",
+    "keeshond",
+    "Brabancon griffon",
+    "Pembroke, Pembroke Welsh corgi",
+    "Cardigan, Cardigan Welsh corgi",
+    "toy poodle",
+    "miniature poodle",
+    "standard poodle",
+    "Mexican hairless",
+    "timber wolf, grey wolf, gray wolf, Canis lupus",
+    "white wolf, Arctic wolf, Canis lupus tundrarum",
+    "red wolf, maned wolf, Canis rufus, Canis niger",
+    "coyote, prairie wolf, brush wolf, Canis latrans",
+    "dingo, warrigal, warragal, Canis dingo",
+    "dhole, Cuon alpinus",
+    "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus",
+    "hyena, hyaena",
+    "red fox, Vulpes vulpes",
+    "kit fox, Vulpes macrotis",
+    "Arctic fox, white fox, Alopex lagopus",
+    "grey fox, gray fox, Urocyon cinereoargenteus",
+    "tabby, tabby cat",
+    "tiger cat",
+    "Persian cat",
+    "Siamese cat, Siamese",
+    "Egyptian cat",
+    "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor",
+    "lynx, catamount",
+    "leopard, Panthera pardus",
+    "snow leopard, ounce, Panthera uncia",
+    "jaguar, panther, Panthera onca, Felis onca",
+    "lion, king of beasts, Panthera leo",
+    "tiger, Panthera tigris",
+    "cheetah, chetah, Acinonyx jubatus",
+    "brown bear, bruin, Ursus arctos",
+    "American black bear, black bear, Ursus americanus, Euarctos americanus",
+    "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus",
+    "sloth bear, Melursus ursinus, Ursus ursinus",
+    "mongoose",
+    "meerkat, mierkat",
+    "tiger beetle",
+    "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle",
+    "ground beetle, carabid beetle",
+    "long-horned beetle, longicorn, longicorn beetle",
+    "leaf beetle, chrysomelid",
+    "dung beetle",
+    "rhinoceros beetle",
+    "weevil",
+    "fly",
+    "bee",
+    "ant, emmet, pismire",
+    "grasshopper, hopper",
+    "cricket",
+    "walking stick, walkingstick, stick insect",
+    "cockroach, roach",
+    "mantis, mantid",
+    "cicada, cicala",
+    "leafhopper",
+    "lacewing, lacewing fly",
+    "dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk",
+    "damselfly",
+    "admiral",
+    "ringlet, ringlet butterfly",
+    "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus",
+    "cabbage butterfly",
+    "sulphur butterfly, sulfur butterfly",
+    "lycaenid, lycaenid butterfly",
+    "starfish, sea star",
+    "sea urchin",
+    "sea cucumber, holothurian",
+    "wood rabbit, cottontail, cottontail rabbit",
+    "hare",
+    "Angora, Angora rabbit",
+    "hamster",
+    "porcupine, hedgehog",
+    "fox squirrel, eastern fox squirrel, Sciurus niger",
+    "marmot",
+    "beaver",
+    "guinea pig, Cavia cobaya",
+    "sorrel",
+    "zebra",
+    "hog, pig, grunter, squealer, Sus scrofa",
+    "wild boar, boar, Sus scrofa",
+    "warthog",
+    "hippopotamus, hippo, river horse, Hippopotamus amphibius",
+    "ox",
+    "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis",
+    "bison",
+    "ram, tup",
+    "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis",
+    "ibex, Capra ibex",
+    "hartebeest",
+    "impala, Aepyceros melampus",
+    "gazelle",
+    "Arabian camel, dromedary, Camelus dromedarius",
+    "llama",
+    "weasel",
+    "mink",
+    "polecat, fitch, foulmart, foumart, Mustela putorius",
+    "black-footed ferret, ferret, Mustela nigripes",
+    "otter",
+    "skunk, polecat, wood pussy",
+    "badger",
+    "armadillo",
+    "three-toed sloth, ai, Bradypus tridactylus",
+    "orangutan, orang, orangutang, Pongo pygmaeus",
+    "gorilla, Gorilla gorilla",
+    "chimpanzee, chimp, Pan troglodytes",
+    "gibbon, Hylobates lar",
+    "siamang, Hylobates syndactylus, Symphalangus syndactylus",
+    "guenon, guenon monkey",
+    "patas, hussar monkey, Erythrocebus patas",
+    "baboon",
+    "macaque",
+    "langur",
+    "colobus, colobus monkey",
+    "proboscis monkey, Nasalis larvatus",
+    "marmoset",
+    "capuchin, ringtail, Cebus capucinus",
+    "howler monkey, howler",
+    "titi, titi monkey",
+    "spider monkey, Ateles geoffroyi",
+    "squirrel monkey, Saimiri sciureus",
+    "Madagascar cat, ring-tailed lemur, Lemur catta",
+    "indri, indris, Indri indri, Indri brevicaudatus",
+    "Indian elephant, Elephas maximus",
+    "African elephant, Loxodonta africana",
+    "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens",
+    "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca",
+    "barracouta, snoek",
+    "eel",
+    "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch",
+    "rock beauty, Holocanthus tricolor",
+    "anemone fish",
+    "sturgeon",
+    "gar, garfish, garpike, billfish, Lepisosteus osseus",
+    "lionfish",
+    "puffer, pufferfish, blowfish, globefish",
+    "abacus",
+    "abaya",
+    "academic gown, academic robe, judge's robe",
+    "accordion, piano accordion, squeeze box",
+    "acoustic guitar",
+    "aircraft carrier, carrier, flattop, attack aircraft carrier",
+    "airliner",
+    "airship, dirigible",
+    "altar",
+    "ambulance",
+    "amphibian, amphibious vehicle",
+    "analog clock",
+    "apiary, bee house",
+    "apron",
+    "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
+    "assault rifle, assault gun",
+    "backpack, back pack, knapsack, packsack, rucksack, haversack",
+    "bakery, bakeshop, bakehouse",
+    "balance beam, beam",
+    "balloon",
+    "ballpoint, ballpoint pen, ballpen, Biro",
+    "Band Aid",
+    "banjo",
+    "bannister, banister, balustrade, balusters, handrail",
+    "barbell",
+    "barber chair",
+    "barbershop",
+    "barn",
+    "barometer",
+    "barrel, cask",
+    "barrow, garden cart, lawn cart, wheelbarrow",
+    "baseball",
+    "basketball",
+    "bassinet",
+    "bassoon",
+    "bathing cap, swimming cap",
+    "bath towel",
+    "bathtub, bathing tub, bath, tub",
+    "beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon",
+    "beacon, lighthouse, beacon light, pharos",
+    "beaker",
+    "bearskin, busby, shako",
+    "beer bottle",
+    "beer glass",
+    "bell cote, bell cot",
+    "bib",
+    "bicycle-built-for-two, tandem bicycle, tandem",
+    "bikini, two-piece",
+    "binder, ring-binder",
+    "binoculars, field glasses, opera glasses",
+    "birdhouse",
+    "boathouse",
+    "bobsled, bobsleigh, bob",
+    "bolo tie, bolo, bola tie, bola",
+    "bonnet, poke bonnet",
+    "bookcase",
+    "bookshop, bookstore, bookstall",
+    "bottlecap",
+    "bow",
+    "bow tie, bow-tie, bowtie",
+    "brass, memorial tablet, plaque",
+    "brassiere, bra, bandeau",
+    "breakwater, groin, groyne, mole, bulwark, seawall, jetty",
+    "breastplate, aegis, egis",
+    "broom",
+    "bucket, pail",
+    "buckle",
+    "bulletproof vest",
+    "bullet train, bullet",
+    "butcher shop, meat market",
+    "cab, hack, taxi, taxicab",
+    "caldron, cauldron",
+    "candle, taper, wax light",
+    "cannon",
+    "canoe",
+    "can opener, tin opener",
+    "cardigan",
+    "car mirror",
+    "carousel, carrousel, merry-go-round, roundabout, whirligig",
+    "carpenter's kit, tool kit",
+    "carton",
+    "car wheel",
+    "cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM",
+    "cassette",
+    "cassette player",
+    "castle",
+    "catamaran",
+    "CD player",
+    "cello, violoncello",
+    "cellular telephone, cellular phone, cellphone, cell, mobile phone",
+    "chain",
+    "chainlink fence",
+    "chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour",
+    "chain saw, chainsaw",
+    "chest",
+    "chiffonier, commode",
+    "chime, bell, gong",
+    "china cabinet, china closet",
+    "Christmas stocking",
+    "church, church building",
+    "cinema, movie theater, movie theatre, movie house, picture palace",
+    "cleaver, meat cleaver, chopper",
+    "cliff dwelling",
+    "cloak",
+    "clog, geta, patten, sabot",
+    "cocktail shaker",
+    "coffee mug",
+    "coffeepot",
+    "coil, spiral, volute, whorl, helix",
+    "combination lock",
+    "computer keyboard, keypad",
+    "confectionery, confectionary, candy store",
+    "container ship, containership, container vessel",
+    "convertible",
+    "corkscrew, bottle screw",
+    "cornet, horn, trumpet, trump",
+    "cowboy boot",
+    "cowboy hat, ten-gallon hat",
+    "cradle",
+    "crane",
+    "crash helmet",
+    "crate",
+    "crib, cot",
+    "Crock Pot",
+    "croquet ball",
+    "crutch",
+    "cuirass",
+    "dam, dike, dyke",
+    "desk",
+    "desktop computer",
+    "dial telephone, dial phone",
+    "diaper, nappy, napkin",
+    "digital clock",
+    "digital watch",
+    "dining table, board",
+    "dishrag, dishcloth",
+    "dishwasher, dish washer, dishwashing machine",
+    "disk brake, disc brake",
+    "dock, dockage, docking facility",
+    "dogsled, dog sled, dog sleigh",
+    "dome",
+    "doormat, welcome mat",
+    "drilling platform, offshore rig",
+    "drum, membranophone, tympan",
+    "drumstick",
+    "dumbbell",
+    "Dutch oven",
+    "electric fan, blower",
+    "electric guitar",
+    "electric locomotive",
+    "entertainment center",
+    "envelope",
+    "espresso maker",
+    "face powder",
+    "feather boa, boa",
+    "file, file cabinet, filing cabinet",
+    "fireboat",
+    "fire engine, fire truck",
+    "fire screen, fireguard",
+    "flagpole, flagstaff",
+    "flute, transverse flute",
+    "folding chair",
+    "football helmet",
+    "forklift",
+    "fountain",
+    "fountain pen",
+    "four-poster",
+    "freight car",
+    "French horn, horn",
+    "frying pan, frypan, skillet",
+    "fur coat",
+    "garbage truck, dustcart",
+    "gasmask, respirator, gas helmet",
+    "gas pump, gasoline pump, petrol pump, island dispenser",
+    "goblet",
+    "go-kart",
+    "golf ball",
+    "golfcart, golf cart",
+    "gondola",
+    "gong, tam-tam",
+    "gown",
+    "grand piano, grand",
+    "greenhouse, nursery, glasshouse",
+    "grille, radiator grille",
+    "grocery store, grocery, food market, market",
+    "guillotine",
+    "hair slide",
+    "hair spray",
+    "half track",
+    "hammer",
+    "hamper",
+    "hand blower, blow dryer, blow drier, hair dryer, hair drier",
+    "hand-held computer, hand-held microcomputer",
+    "handkerchief, hankie, hanky, hankey",
+    "hard disc, hard disk, fixed disk",
+    "harmonica, mouth organ, harp, mouth harp",
+    "harp",
+    "harvester, reaper",
+    "hatchet",
+    "holster",
+    "home theater, home theatre",
+    "honeycomb",
+    "hook, claw",
+    "hoopskirt, crinoline",
+    "horizontal bar, high bar",
+    "horse cart, horse-cart",
+    "hourglass",
+    "iPod",
+    "iron, smoothing iron",
+    "jack-o'-lantern",
+    "jean, blue jean, denim",
+    "jeep, landrover",
+    "jersey, T-shirt, tee shirt",
+    "jigsaw puzzle",
+    "jinrikisha, ricksha, rickshaw",
+    "joystick",
+    "kimono",
+    "knee pad",
+    "knot",
+    "lab coat, laboratory coat",
+    "ladle",
+    "lampshade, lamp shade",
+    "laptop, laptop computer",
+    "lawn mower, mower",
+    "lens cap, lens cover",
+    "letter opener, paper knife, paperknife",
+    "library",
+    "lifeboat",
+    "lighter, light, igniter, ignitor",
+    "limousine, limo",
+    "liner, ocean liner",
+    "lipstick, lip rouge",
+    "Loafer",
+    "lotion",
+    "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
+    "loupe, jeweler's loupe",
+    "lumbermill, sawmill",
+    "magnetic compass",
+    "mailbag, postbag",
+    "mailbox, letter box",
+    "maillot",
+    "maillot, tank suit",
+    "manhole cover",
+    "maraca",
+    "marimba, xylophone",
+    "mask",
+    "matchstick",
+    "maypole",
+    "maze, labyrinth",
+    "measuring cup",
+    "medicine chest, medicine cabinet",
+    "megalith, megalithic structure",
+    "microphone, mike",
+    "microwave, microwave oven",
+    "military uniform",
+    "milk can",
+    "minibus",
+    "miniskirt, mini",
+    "minivan",
+    "missile",
+    "mitten",
+    "mixing bowl",
+    "mobile home, manufactured home",
+    "Model T",
+    "modem",
+    "monastery",
+    "monitor",
+    "moped",
+    "mortar",
+    "mortarboard",
+    "mosque",
+    "mosquito net",
+    "motor scooter, scooter",
+    "mountain bike, all-terrain bike, off-roader",
+    "mountain tent",
+    "mouse, computer mouse",
+    "mousetrap",
+    "moving van",
+    "muzzle",
+    "nail",
+    "neck brace",
+    "necklace",
+    "nipple",
+    "notebook, notebook computer",
+    "obelisk",
+    "oboe, hautboy, hautbois",
+    "ocarina, sweet potato",
+    "odometer, hodometer, mileometer, milometer",
+    "oil filter",
+    "organ, pipe organ",
+    "oscilloscope, scope, cathode-ray oscilloscope, CRO",
+    "overskirt",
+    "oxcart",
+    "oxygen mask",
+    "packet",
+    "paddle, boat paddle",
+    "paddlewheel, paddle wheel",
+    "padlock",
+    "paintbrush",
+    "pajama, pyjama, pj's, jammies",
+    "palace",
+    "panpipe, pandean pipe, syrinx",
+    "paper towel",
+    "parachute, chute",
+    "parallel bars, bars",
+    "park bench",
+    "parking meter",
+    "passenger car, coach, carriage",
+    "patio, terrace",
+    "pay-phone, pay-station",
+    "pedestal, plinth, footstall",
+    "pencil box, pencil case",
+    "pencil sharpener",
+    "perfume, essence",
+    "Petri dish",
+    "photocopier",
+    "pick, plectrum, plectron",
+    "pickelhaube",
+    "picket fence, paling",
+    "pickup, pickup truck",
+    "pier",
+    "piggy bank, penny bank",
+    "pill bottle",
+    "pillow",
+    "ping-pong ball",
+    "pinwheel",
+    "pirate, pirate ship",
+    "pitcher, ewer",
+    "plane, carpenter's plane, woodworking plane",
+    "planetarium",
+    "plastic bag",
+    "plate rack",
+    "plow, plough",
+    "plunger, plumber's helper",
+    "Polaroid camera, Polaroid Land camera",
+    "pole",
+    "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria",
+    "poncho",
+    "pool table, billiard table, snooker table",
+    "pop bottle, soda bottle",
+    "pot, flowerpot",
+    "potter's wheel",
+    "power drill",
+    "prayer rug, prayer mat",
+    "printer",
+    "prison, prison house",
+    "projectile, missile",
+    "projector",
+    "puck, hockey puck",
+    "punching bag, punch bag, punching ball, punchball",
+    "purse",
+    "quill, quill pen",
+    "quilt, comforter, comfort, puff",
+    "racer, race car, racing car",
+    "racket, racquet",
+    "radiator",
+    "radio, wireless",
+    "radio telescope, radio reflector",
+    "rain barrel",
+    "recreational vehicle, RV, R.V.",
+    "reel",
+    "reflex camera",
+    "refrigerator, icebox",
+    "remote control, remote",
+    "restaurant, eating house, eating place, eatery",
+    "revolver, six-gun, six-shooter",
+    "rifle",
+    "rocking chair, rocker",
+    "rotisserie",
+    "rubber eraser, rubber, pencil eraser",
+    "rugby ball",
+    "rule, ruler",
+    "running shoe",
+    "safe",
+    "safety pin",
+    "saltshaker, salt shaker",
+    "sandal",
+    "sarong",
+    "sax, saxophone",
+    "scabbard",
+    "scale, weighing machine",
+    "school bus",
+    "schooner",
+    "scoreboard",
+    "screen, CRT screen",
+    "screw",
+    "screwdriver",
+    "seat belt, seatbelt",
+    "sewing machine",
+    "shield, buckler",
+    "shoe shop, shoe-shop, shoe store",
+    "shoji",
+    "shopping basket",
+    "shopping cart",
+    "shovel",
+    "shower cap",
+    "shower curtain",
+    "ski",
+    "ski mask",
+    "sleeping bag",
+    "slide rule, slipstick",
+    "sliding door",
+    "slot, one-armed bandit",
+    "snorkel",
+    "snowmobile",
+    "snowplow, snowplough",
+    "soap dispenser",
+    "soccer ball",
+    "sock",
+    "solar dish, solar collector, solar furnace",
+    "sombrero",
+    "soup bowl",
+    "space bar",
+    "space heater",
+    "space shuttle",
+    "spatula",
+    "speedboat",
+    "spider web, spider's web",
+    "spindle",
+    "sports car, sport car",
+    "spotlight, spot",
+    "stage",
+    "steam locomotive",
+    "steel arch bridge",
+    "steel drum",
+    "stethoscope",
+    "stole",
+    "stone wall",
+    "stopwatch, stop watch",
+    "stove",
+    "strainer",
+    "streetcar, tram, tramcar, trolley, trolley car",
+    "stretcher",
+    "studio couch, day bed",
+    "stupa, tope",
+    "submarine, pigboat, sub, U-boat",
+    "suit, suit of clothes",
+    "sundial",
+    "sunglass",
+    "sunglasses, dark glasses, shades",
+    "sunscreen, sunblock, sun blocker",
+    "suspension bridge",
+    "swab, swob, mop",
+    "sweatshirt",
+    "swimming trunks, bathing trunks",
+    "swing",
+    "switch, electric switch, electrical switch",
+    "syringe",
+    "table lamp",
+    "tank, army tank, armored combat vehicle, armoured combat vehicle",
+    "tape player",
+    "teapot",
+    "teddy, teddy bear",
+    "television, television system",
+    "tennis ball",
+    "thatch, thatched roof",
+    "theater curtain, theatre curtain",
+    "thimble",
+    "thresher, thrasher, threshing machine",
+    "throne",
+    "tile roof",
+    "toaster",
+    "tobacco shop, tobacconist shop, tobacconist",
+    "toilet seat",
+    "torch",
+    "totem pole",
+    "tow truck, tow car, wrecker",
+    "toyshop",
+    "tractor",
+    "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi",
+    "tray",
+    "trench coat",
+    "tricycle, trike, velocipede",
+    "trimaran",
+    "tripod",
+    "triumphal arch",
+    "trolleybus, trolley coach, trackless trolley",
+    "trombone",
+    "tub, vat",
+    "turnstile",
+    "typewriter keyboard",
+    "umbrella",
+    "unicycle, monocycle",
+    "upright, upright piano",
+    "vacuum, vacuum cleaner",
+    "vase",
+    "vault",
+    "velvet",
+    "vending machine",
+    "vestment",
+    "viaduct",
+    "violin, fiddle",
+    "volleyball",
+    "waffle iron",
+    "wall clock",
+    "wallet, billfold, notecase, pocketbook",
+    "wardrobe, closet, press",
+    "warplane, military plane",
+    "washbasin, handbasin, washbowl, lavabo, wash-hand basin",
+    "washer, automatic washer, washing machine",
+    "water bottle",
+    "water jug",
+    "water tower",
+    "whiskey jug",
+    "whistle",
+    "wig",
+    "window screen",
+    "window shade",
+    "Windsor tie",
+    "wine bottle",
+    "wing",
+    "wok",
+    "wooden spoon",
+    "wool, woolen, woollen",
+    "worm fence, snake fence, snake-rail fence, Virginia fence",
+    "wreck",
+    "yawl",
+    "yurt",
+    "web site, website, internet site, site",
+    "comic book",
+    "crossword puzzle, crossword",
+    "street sign",
+    "traffic light, traffic signal, stoplight",
+    "book jacket, dust cover, dust jacket, dust wrapper",
+    "menu",
+    "plate",
+    "guacamole",
+    "consomme",
+    "hot pot, hotpot",
+    "trifle",
+    "ice cream, icecream",
+    "ice lolly, lolly, lollipop, popsicle",
+    "French loaf",
+    "bagel, beigel",
+    "pretzel",
+    "cheeseburger",
+    "hotdog, hot dog, red hot",
+    "mashed potato",
+    "head cabbage",
+    "broccoli",
+    "cauliflower",
+    "zucchini, courgette",
+    "spaghetti squash",
+    "acorn squash",
+    "butternut squash",
+    "cucumber, cuke",
+    "artichoke, globe artichoke",
+    "bell pepper",
+    "cardoon",
+    "mushroom",
+    "Granny Smith",
+    "strawberry",
+    "orange",
+    "lemon",
+    "fig",
+    "pineapple, ananas",
+    "banana",
+    "jackfruit, jak, jack",
+    "custard apple",
+    "pomegranate",
+    "hay",
+    "carbonara",
+    "chocolate sauce, chocolate syrup",
+    "dough",
+    "meat loaf, meatloaf",
+    "pizza, pizza pie",
+    "potpie",
+    "burrito",
+    "red wine",
+    "espresso",
+    "cup",
+    "eggnog",
+    "alp",
+    "bubble",
+    "cliff, drop, drop-off",
+    "coral reef",
+    "geyser",
+    "lakeside, lakeshore",
+    "promontory, headland, head, foreland",
+    "sandbar, sand bar",
+    "seashore, coast, seacoast, sea-coast",
+    "valley, vale",
+    "volcano",
+    "ballplayer, baseball player",
+    "groom, bridegroom",
+    "scuba diver",
+    "rapeseed",
+    "daisy",
+    "yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum",
+    "corn",
+    "acorn",
+    "hip, rose hip, rosehip",
+    "buckeye, horse chestnut, conker",
+    "coral fungus",
+    "agaric",
+    "gyromitra",
+    "stinkhorn, carrion fungus",
+    "earthstar",
+    "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa",
+    "bolete",
+    "ear, spike, capitulum",
+    "toilet tissue, toilet paper, bathroom tissue"
+  ],
+  "detection_num_classes": 80,
+  "torch_dtype": "float32",
+  "detection_hidden": 160,
+  "detection_n_std_layers": 5,
+  "detection_n_dw_layers": 4,
+  "detection_n_scales": 4,
+  "detection_pos_emb_dim": 64,
+  "detection_text_embed_dim": 768
+}

model.bf16_backbone.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ccd2f51f285eef54c7b1466e92d3da83c13f410e9476db70b614663f2825fe9a
+size 240885372

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1837c48d63921ddc18e9fefa5adcf8292ab36728974bb7c9f14c16b1f71ea3d0
+size 412169724

rf100vl_zero_shot_cross_domain_eval.json ADDED Viewed

	@@ -0,0 +1,499 @@

+{
+  "picker": {
+    "path": "/mnt/d/detection-heads/heads/cofiber_threshold/split_tower_5scale_160h_5std_4dw_ema_l14_16ep_768_cls_calib/checkpoint_final.pth",
+    "n_params": 2975067,
+    "text_embed_dim": 768
+  },
+  "fcos": {
+    "path": "/mnt/d/_tmp/argus_fcos_head.pth",
+    "n_params": 16138074
+  },
+  "cache_dir": "/home/zootest/datasets/rf100vl_val_cache_768",
+  "resolution": 768,
+  "score_thresh": 0.05,
+  "max_per_image": 100,
+  "domains": [
+    {
+      "domain": "actions",
+      "n_items": 818,
+      "n_domain_classes": 6,
+      "domain_class_names": [
+        "Attack",
+        "Block",
+        "Defense",
+        "Serve",
+        "Set",
+        "ball"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.3958944594145987
+        },
+        "fcos": {
+          "AR@100": 0.3750611259658937
+        },
+        "delta_AR@100": 0.020833333448705027
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "aerial-airport",
+      "n_items": 66,
+      "n_domain_classes": 1,
+      "domain_class_names": [
+        "airplane"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.17270012670337706
+        },
+        "fcos": {
+          "AR@100": 0.16126021817890984
+        },
+        "delta_AR@100": 0.011439908524467218
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "all-elements",
+      "n_items": 142,
+      "n_domain_classes": 10,
+      "domain_class_names": [
+        "Button",
+        "Check box",
+        "Checked Radio button",
+        "Checked box",
+        "Dropdown box",
+        "Dropdown expand",
+        "Icon",
+        "Radio button",
+        "Scroll bar",
+        "Text box"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.07888318364128051
+        },
+        "fcos": {
+          "AR@100": 0.02245109092417947
+        },
+        "delta_AR@100": 0.05643209271710104
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "aquarium-combined",
+      "n_items": 127,
+      "n_domain_classes": 7,
+      "domain_class_names": [
+        "fish",
+        "jellyfish",
+        "penguin",
+        "puffin",
+        "shark",
+        "starfish",
+        "stingray"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.5815294733842996
+        },
+        "fcos": {
+          "AR@100": 0.4753463245165629
+        },
+        "delta_AR@100": 0.10618314886773672
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "defect-detection",
+      "n_items": 375,
+      "n_domain_classes": 4,
+      "domain_class_names": [
+        "defective fishplate",
+        "fastener",
+        "missing fastener",
+        "non defective fishplate"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.0029333333333333334
+        },
+        "fcos": {
+          "AR@100": 0.0010666666666666667
+        },
+        "delta_AR@100": 0.0018666666666666666
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "dentalai",
+      "n_items": 253,
+      "n_domain_classes": 4,
+      "domain_class_names": [
+        "Cavity",
+        "Fillings",
+        "Impacted Tooth",
+        "Implant"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.00916757190769369
+        },
+        "fcos": {
+          "AR@100": 0.0018475266961240958
+        },
+        "delta_AR@100": 0.007320045211569594
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "flir-camera-objects",
+      "n_items": 2513,
+      "n_domain_classes": 4,
+      "domain_class_names": [
+        "bicycle",
+        "car",
+        "dog",
+        "person"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.5425524384191743
+        },
+        "fcos": {
+          "AR@100": 0.5309809631263704
+        },
+        "delta_AR@100": 0.01157147529280389
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "gwhd2021",
+      "n_items": 1278,
+      "n_domain_classes": 1,
+      "domain_class_names": [
+        "whd"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.0147656631658906
+        },
+        "fcos": {
+          "AR@100": 0.017273207562968296
+        },
+        "delta_AR@100": -0.002507544397077696
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "lacrosse-object-detection",
+      "n_items": 100,
+      "n_domain_classes": 4,
+      "domain_class_names": [
+        "Goalie",
+        "Longpole",
+        "Referee",
+        "Shortstick"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.6658460140824318
+        },
+        "fcos": {
+          "AR@100": 0.5788187006562948
+        },
+        "delta_AR@100": 0.08702731342613701
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "new-defects-in-wood",
+      "n_items": 253,
+      "n_domain_classes": 5,
+      "domain_class_names": [
+        "Crack",
+        "Dead knot",
+        "Holes",
+        "Live knot",
+        "knot with crack"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.14613965756219366
+        },
+        "fcos": {
+          "AR@100": 0.056389986832622495
+        },
+        "delta_AR@100": 0.08974967072957116
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "orionproducts",
+      "n_items": 117,
+      "n_domain_classes": 8,
+      "domain_class_names": [
+        "Candy Boom",
+        "Chocopie Dark",
+        "Chocopie Nor",
+        "Marine Boy",
+        "OStar Red",
+        "OStar Yellow",
+        "Swing Maxx",
+        "Swing Nor"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.25509543324000816
+        },
+        "fcos": {
+          "AR@100": 0.17056578116921278
+        },
+        "delta_AR@100": 0.08452965207079538
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "paper-parts",
+      "n_items": 2407,
+      "n_domain_classes": 19,
+      "domain_class_names": [
+        "author",
+        "chapter",
+        "equation",
+        "equation number",
+        "figure",
+        "figure caption",
+        "footnote",
+        "list of content heading",
+        "list of content text",
+        "page number",
+        "paragraph",
+        "reference text",
+        "section",
+        "subsection",
+        "subsubsection",
+        "table",
+        "table caption",
+        "table of contents text",
+        "title"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.22175769264448789
+        },
+        "fcos": {
+          "AR@100": 0.19341745339612595
+        },
+        "delta_AR@100": 0.02834023924836193
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "recode-waste",
+      "n_items": 500,
+      "n_domain_classes": 6,
+      "domain_class_names": [
+        "aggregate",
+        "cardboard",
+        "hard plastic",
+        "metal",
+        "soft plastic",
+        "timber"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.11759555092621594
+        },
+        "fcos": {
+          "AR@100": 0.11386011148523538
+        },
+        "delta_AR@100": 0.0037354394409805647
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "soda-bottles",
+      "n_items": 449,
+      "n_domain_classes": 3,
+      "domain_class_names": [
+        "coca-cola",
+        "fanta",
+        "sprite"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.3584433421534061
+        },
+        "fcos": {
+          "AR@100": 0.29591111459858493
+        },
+        "delta_AR@100": 0.06253222755482118
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "the-dreidel-project",
+      "n_items": 108,
+      "n_domain_classes": 6,
+      "domain_class_names": [
+        "Dreidel",
+        "Gimel",
+        "Hay",
+        "Nun",
+        "Shin",
+        "Spinning Dreidel"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.6514730654725874
+        },
+        "fcos": {
+          "AR@100": 0.5772390587393332
+        },
+        "delta_AR@100": 0.0742340067332542
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "trail-camera",
+      "n_items": 261,
+      "n_domain_classes": 2,
+      "domain_class_names": [
+        "Deer",
+        "Hog"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.6955832277392519
+        },
+        "fcos": {
+          "AR@100": 0.6009897838607146
+        },
+        "delta_AR@100": 0.0945934438785373
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "water-meter",
+      "n_items": 136,
+      "n_domain_classes": 10,
+      "domain_class_names": [
+        "0",
+        "1",
+        "2",
+        "3",
+        "4",
+        "5",
+        "6",
+        "7",
+        "8",
+        "9"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.0003676470588235294
+        },
+        "fcos": {
+          "AR@100": 0.00665441194877905
+        },
+        "delta_AR@100": -0.006286764889955521
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "wb-prova",
+      "n_items": 289,
+      "n_domain_classes": 3,
+      "domain_class_names": [
+        "Adult",
+        "Juvenile",
+        "Piglet"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.8620688187519159
+        },
+        "fcos": {
+          "AR@100": 0.8358979204224879
+        },
+        "delta_AR@100": 0.026170898329427983
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "wildfire-smoke",
+      "n_items": 147,
+      "n_domain_classes": 1,
+      "domain_class_names": [
+        "smoke"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.005442176870748299
+        },
+        "fcos": {
+          "AR@100": 0.0034013605442176865
+        },
+        "delta_AR@100": 0.0020408163265306124
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    },
+    {
+      "domain": "x-ray-id",
+      "n_items": 767,
+      "n_domain_classes": 6,
+      "domain_class_names": [
+        "DIP",
+        "MCP",
+        "PIP",
+        "Radius",
+        "Ulna",
+        "Wrist"
+      ],
+      "mode_a_class_agnostic": {
+        "picker": {
+          "AR@100": 0.0
+        },
+        "fcos": {
+          "AR@100": 1.5338599642998546e-05
+        },
+        "delta_AR@100": -1.5338599642998546e-05
+      },
+      "mode_b_text_swap_picker": null,
+      "mode_b_error": "skipped (--skip-text-swap)"
+    }
+  ],
+  "aggregate": {
+    "n_domains": 20,
+    "mode_a_picker_AR100_mean": 0.2889119438235859,
+    "mode_a_fcos_AR100_mean": 0.25092240729454635,
+    "mode_a_delta_mean": 0.037989536529039566,
+    "mode_a_picker_wins": 17,
+    "mode_a_fcos_wins": 3,
+    "mode_a_ties": 0,
+    "mode_b_n_domains_valid": 0,
+    "mode_b_picker_text_swap_mAP_mean": 0.0,
+    "mode_b_picker_text_swap_mAP_median": 0.0
+  }
+}