Spaces · Running on Zero

derektan committed
Commit · dd3c1c5
Parent(s): 56e7382

First commit. Using Git LFS for binaries
Files changed:
- .gitattributes  +3 -0
- .gitignore  +1 -0
- app.py  +271 -0
- clip_vision_per_patch_model.py  +26 -0
- examples/NAIP_yosemite_v3_resized.png  +3 -0
- examples/american_black_bear_inat_248820933.jpeg  +3 -0
- requirements.txt  +11 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
**/__pycache__/
app.py
ADDED
@@ -0,0 +1,271 @@
"""
EcoMonitor • multimodal heat-map demo (with custom preprocessing)
"""

# ────────────────────────── imports ───────────────────────────────────
import cv2
import gradio as gr
import torch
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import io

from torchvision import transforms
import open_clip
from clip_vision_per_patch_model import CLIPVisionPerPatchModel

# ────────────────────────── global config & models ────────────────────
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1️⃣ BioCLIP (ground-image & text encoder)
bio_model, _, _ = open_clip.create_model_and_transforms("hf-hub:imageomics/bioclip")
bio_model = bio_model.to(device).eval()
bio_tokenizer = open_clip.get_tokenizer("hf-hub:imageomics/bioclip")

# 2️⃣ Satellite patch encoder (CLIP-L-336 per-patch)
sat_model: CLIPVisionPerPatchModel = (
    CLIPVisionPerPatchModel.from_pretrained("derektan95/search-tta")
    .to(device)
    .eval()
)

logit_scale = torch.nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
logit_scale = logit_scale.exp()
blur_kernel = (5, 5)

# ────────────────────────── transforms (exact spec) ───────────────────
img_transform = transforms.Compose(
    [
        transforms.Resize((256, 256)),
        transforms.CenterCrop((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

imo_transform = transforms.Compose(
    [
        transforms.Resize((336, 336)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# ────────────────────────── helpers ───────────────────────────────────
# def _tensor_ground(img_pil: Image.Image) -> torch.Tensor:
#     return img_transform(img_pil).unsqueeze(0).to(device)


# def _tensor_sat(img_pil: Image.Image) -> torch.Tensor:
#     return imo_transform(img_pil).unsqueeze(0).to(device)


@torch.no_grad()
def _encode_ground(img_pil: Image.Image) -> torch.Tensor:
    img = img_transform(img_pil).unsqueeze(0).to(device)
    img_embeds, *_ = bio_model(img)
    return img_embeds
    # feats = bio_model.encode_image(_tensor_ground(img_pil))
    # return torch.nn.functional.normalize(feats, dim=-1)


@torch.no_grad()
def _encode_text(text: str) -> torch.Tensor:
    toks = bio_tokenizer(text).to(device)
    _, txt_embeds, _ = bio_model(text=toks)
    return txt_embeds
    # return torch.nn.functional.normalize(feats, dim=-1)


@torch.no_grad()
def _encode_sat(img_pil: Image.Image) -> torch.Tensor:
    imo = imo_transform(img_pil).unsqueeze(0).to(device)
    imo_embeds = sat_model(imo)
    return imo_embeds
    # out = sat_model(_tensor_sat(img_pil))
    # if hasattr(out, "last_hidden_state"):
    #     out = out.last_hidden_state
    # return torch.nn.functional.normalize(out.squeeze(0), dim=-1)  # (P, D)
    # return out


def _similarity_heatmap(query: torch.Tensor, patches: torch.Tensor) -> np.ndarray:
    sims = torch.matmul(query, patches.t()) * logit_scale
    sims = sims.t().sigmoid()
    # sims = torch.sigmoid(patches @ query.squeeze(0))  # (P,)
    sims = sims[1:].squeeze()  # drop CLS token
    side = int(np.sqrt(len(sims)))
    sims = sims.reshape(side, side)
    return sims.cpu().detach().numpy()
    # return sims[: side * side].view(side, side).cpu().numpy()


def _array_to_pil(arr: np.ndarray) -> Image.Image:
    """
    Render arr with viridis, automatically stretching its own min→max to 0→1
    so that the most-similar patches appear yellow.
    """

    # Gaussian smoothing
    if blur_kernel != (0, 0):
        arr = cv2.GaussianBlur(arr, blur_kernel, 0)

    # --- contrast-stretch to local 0-1 range --------------------------
    arr_min, arr_max = float(arr.min()), float(arr.max())
    if arr_max - arr_min < 1e-6:  # avoid /0 when the heat-map is flat
        arr_scaled = np.zeros_like(arr)
    else:
        arr_scaled = (arr - arr_min) / (arr_max - arr_min)
    # ------------------------------------------------------------------
    fig, ax = plt.subplots(figsize=(2.6, 2.6), dpi=96)
    ax.imshow(arr_scaled, cmap="viridis", vmin=0.0, vmax=1.0)
    ax.axis("off")
    buf = io.BytesIO()
    plt.tight_layout(pad=0)
    fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf)


# ────────────────────────── main inference ────────────────────────────
def process(
    sat_img: Image.Image,
    taxonomy: str,
    ground_img: Image.Image | None,
):
    if sat_img is None:
        return None, None

    patches = _encode_sat(sat_img)

    heat_ground, heat_text = None, None

    if ground_img is not None:
        q_img = _encode_ground(ground_img)
        heat_ground = _array_to_pil(_similarity_heatmap(q_img, patches))

    if taxonomy.strip():
        q_txt = _encode_text(taxonomy.strip())
        heat_text = _array_to_pil(_similarity_heatmap(q_txt, patches))

    return heat_ground, heat_text


# ────────────────────────── Gradio UI ─────────────────────────────────
with gr.Blocks(title="EcoMonitor", theme=gr.themes.Base()) as demo:

    with gr.Row():
        gr.Markdown(
            """
            <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
              <div>
                <h1>Search-TTA: A Multimodal Test-Time Adaptation Framework for Visual Search in the Wild</h1>
                <span></span>
                <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
                  <a href="https://search-tta.github.io">Project Website</a>
                </h2>
              </div>
            </div>
            """
            # <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>WACV 2025</h2>
            # <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
            # <a href="https://derektan95.github.io">Derek M. S. Tan</a>,
            # <a href="https://chinchinati.github.io/">Shailesh</a>,
            # <a href="https://www.linkedin.com/in/boyang-liu-nus">Boyang Liu</a>,
            # <a href="https://www.linkedin.com/in/loki-silvres">Alok Raj</a>,
            # <a href="https://www.linkedin.com/in/ang-qi-xuan-714347142">Qi Xuan Ang</a>,
            # <a href="https://weihengdai.top">Weiheng Dai</a>,
            # <a href="https://www.linkedin.com/in/tanishqduhan">Tanishq Duhan</a>,
            # <a href="https://www.linkedin.com/in/jimmychiun">Jimmy Chiun</a>,
            # <a href="https://www.yuhongcao.online/">Yuhong Cao</a>,
            # <a href="https://www.cs.toronto.edu/~florian/">Florian Shkurti</a>,
            # <a href="https://www.marmotlab.org/bio.html">Guillaume Sartoretti</a>
            # </h2>
            # <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>National University of Singapore, University of Toronto, IIT-Dhanbad, Singapore Technologies Engineering</h2>
        )

    with gr.Row(variant="panel"):

        # LEFT COLUMN (satellite, taxonomy, run)
        with gr.Column():
            sat_input = gr.Image(
                label="Satellite Image",
                sources=["upload"],
                type="pil",
                height=320,
            )
            taxonomy_input = gr.Textbox(
                label="Full Taxonomy Name (optional)",
                placeholder="e.g. Animalia Chordata Mammalia Carnivora Ursidae Ursus arctos",
            )
            run_btn = gr.Button("Run", variant="primary")

        # RIGHT COLUMN (ground image + two heat-maps)
        with gr.Column():
            ground_input = gr.Image(
                label="Ground-level Image (optional)",
                sources=["upload"],
                type="pil",
                height=320,
            )
            heat_ground_out = gr.Image(
                label="Heat-map (Ground query)",
                height=160,
            )
            heat_text_out = gr.Image(
                label="Heat-map (Text query)",
                height=160,
            )

    # EXAMPLES
    with gr.Row():
        gr.Examples(
            examples=[
                [
                    "examples/NAIP_yosemite_v3_resized.png",
                    "Animalia Chordata Mammalia Carnivora Ursidae Ursus americanus",
                    "examples/american_black_bear_inat_248820933.jpeg",
                ],
                # [
                #     "examples/satellite_coast.png",
                #     "",
                #     "examples/ground_gull.jpg",
                # ],
                # [
                #     "examples/satellite_coast.png",
                #     "Animalia Chordata Aves Charadriiformes Laridae Larus argentatus",
                #     None,
                # ],
            ],
            inputs=[sat_input, taxonomy_input, ground_input],
            outputs=[heat_ground_out, heat_text_out],
            fn=process,
            cache_examples=False,
        )

    # CALLBACK
    run_btn.click(
        fn=process,
        inputs=[sat_input, taxonomy_input, ground_input],
        outputs=[heat_ground_out, heat_text_out],
    )

    # Footer pointing to the model and data sources used by this Space.
    gr.Markdown(
        """
        This model is fine-tuned using [Sentinel-2 Level 2A](https://docs.sentinel-hub.com/api/latest/data/sentinel-2-l2a/) satellite images and taxonomy images and locations from [iNaturalist](https://inaturalist.org/).
        """
    )

# LAUNCH
if __name__ == "__main__":
    demo.queue(max_size=15)
    demo.launch(share=True)
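The heat-map construction in _similarity_heatmap above relies on some implicit shape bookkeeping: the satellite encoder returns one embedding per ViT patch plus a leading CLS token, and the function assumes the remaining patch count is a perfect square. Below is a minimal sketch of that arithmetic using dummy tensors only (no model weights are loaded); the 24×24 grid follows from the CLIP-L-336 geometry named in the code (336-pixel input, 14-pixel patches), while proj_dim = 512 is a placeholder width chosen for illustration, not the real projection dimension.

import numpy as np
import torch

# Dummy stand-ins for the real embeddings (assumed shapes, random values).
proj_dim = 512                       # placeholder projection width
side = 336 // 14                     # CLIP-L-336: 24 x 24 patch grid
n_tokens = side * side + 1           # 576 patches + 1 CLS token = 577

query = torch.nn.functional.normalize(torch.randn(1, proj_dim), dim=-1)           # text or ground-image embedding
patches = torch.nn.functional.normalize(torch.randn(n_tokens, proj_dim), dim=-1)  # per-patch satellite embeddings

logit_scale = torch.exp(torch.ones([]) * np.log(1 / 0.07))  # same initialization as app.py

# Mirror _similarity_heatmap: scaled similarities -> sigmoid -> drop CLS -> square grid.
sims = (query @ patches.t() * logit_scale).t().sigmoid()  # (577, 1)
sims = sims[1:].squeeze()                                 # (576,) after dropping the CLS token
grid = sims.reshape(int(np.sqrt(len(sims))), -1)          # (24, 24)
print(grid.shape)                                         # torch.Size([24, 24])

Nothing here touches the real checkpoints; it only illustrates why side = int(np.sqrt(len(sims))) recovers a square heat-map for this patch layout.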
clip_vision_per_patch_model.py
ADDED
@@ -0,0 +1,26 @@
import torch
from transformers import CLIPVisionModelWithProjection, CLIPVisionConfig

class CLIPVisionPerPatchModel(CLIPVisionModelWithProjection):
    """
    Like CLIPVisionModelWithProjection but returns
    per-patch embeddings instead of pooled CLS tokens.
    """
    def __init__(self, config: CLIPVisionConfig):
        super().__init__(config)
        # everything else (self.vision_model, self.visual_projection)
        # is set up for you by the parent class

    def forward(self, pixel_values, **kwargs):
        # 1) run the ViT backbone → last_hidden_state [B, n_patches, hidden_size]
        outputs = self.vision_model(pixel_values, return_dict=True, **kwargs)
        hidden_states = outputs.last_hidden_state

        # 2) project every patch token → [B, n_patches, projection_dim]
        patch_embeds = self.visual_projection(hidden_states)

        # 3) Postprocessing embeds
        patch_embeds = torch.nn.functional.normalize(patch_embeds, dim=-1)
        patch_embeds = patch_embeds.squeeze()  # (Patches, proj_dim)

        return patch_embeds
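As a quick sanity check of the class above, the sketch below instantiates it from a randomly initialized CLIPVisionConfig rather than the derektan95/search-tta checkpoint, so it runs offline. image_size=336 and patch_size=14 are assumptions chosen to match the CLIP-L-336 geometry referenced in app.py; the hidden and projection widths are left at the config defaults, so the output width shown is illustrative only.

import torch
from transformers import CLIPVisionConfig
from clip_vision_per_patch_model import CLIPVisionPerPatchModel

# Randomly initialized config (offline sketch) matching the 336/14 patch layout.
config = CLIPVisionConfig(image_size=336, patch_size=14)
model = CLIPVisionPerPatchModel(config).eval()

pixel_values = torch.randn(1, 3, 336, 336)   # one fake preprocessed satellite image
with torch.no_grad():
    patch_embeds = model(pixel_values)

# 1 CLS token + (336 / 14)**2 patches = 577 rows, one L2-normalized embedding per token.
print(patch_embeds.shape)             # e.g. torch.Size([577, 512]) with default widths
print(patch_embeds.norm(dim=-1)[:3])  # ≈ 1.0 for each row

The squeeze() at the end of forward drops the batch dimension for batch size 1, which is why app.py can index the CLS token directly with sims[1:].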
examples/NAIP_yosemite_v3_resized.png
ADDED
(binary image, stored via Git LFS)

examples/american_black_bear_inat_248820933.jpeg
ADDED
(binary image, stored via Git LFS)
requirements.txt
ADDED
@@ -0,0 +1,11 @@
# python 3.10.14

numpy==1.26.3
torch==2.4.1
torchvision==0.19.1
pytorch-lightning==2.2.1
open_clip_torch==2.30.0
transformers==4.45.1
tokenizers==0.20.3
opencv-python==4.10.0.84
gradio==3.39.0