Jaocs committed on
Commit
c096a7a
·
1 Parent(s): 0f0079f

application3

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. Dockerfile +72 -0
  2. LICENSE.txt +12 -0
  3. README.md +232 -8
  4. configs/gs/base.yaml +51 -0
  5. configs/train.yaml +38 -0
  6. extra/archive/rasterizer_impl.h +75 -0
  7. extra/archive/simple-knn.patch.txt +13 -0
  8. full_eval.py +102 -0
  9. gradio_demo.py +424 -0
  10. install.sh +35 -0
  11. main.py +268 -0
  12. metrics.py +115 -0
  13. requirements.txt +1 -0
  14. script.bash +0 -0
  15. source/EDGS.code-workspace +11 -0
  16. source/__init__.py +0 -0
  17. source/corr_init.py +907 -0
  18. source/data_utils.py +28 -0
  19. source/losses.py +100 -0
  20. source/networks.py +48 -0
  21. source/timer.py +24 -0
  22. source/trainer.py +265 -0
  23. source/utils_aux.py +92 -0
  24. source/utils_preprocess.py +334 -0
  25. source/visualization.py +1072 -0
  26. submodules/RoMa/.gitignore +11 -0
  27. submodules/RoMa/LICENSE +21 -0
  28. submodules/RoMa/README.md +123 -0
  29. submodules/RoMa/data/.gitignore +2 -0
  30. submodules/RoMa/demo/demo_3D_effect.py +47 -0
  31. submodules/RoMa/demo/demo_fundamental.py +34 -0
  32. submodules/RoMa/demo/demo_match.py +50 -0
  33. submodules/RoMa/demo/demo_match_opencv_sift.py +43 -0
  34. submodules/RoMa/demo/demo_match_tiny.py +77 -0
  35. submodules/RoMa/demo/gif/.gitignore +2 -0
  36. submodules/RoMa/experiments/eval_roma_outdoor.py +57 -0
  37. submodules/RoMa/experiments/eval_tiny_roma_v1_outdoor.py +84 -0
  38. submodules/RoMa/experiments/roma_indoor.py +320 -0
  39. submodules/RoMa/experiments/train_roma_outdoor.py +307 -0
  40. submodules/RoMa/experiments/train_tiny_roma_v1_outdoor.py +498 -0
  41. submodules/RoMa/requirements.txt +14 -0
  42. submodules/RoMa/romatch/__init__.py +8 -0
  43. submodules/RoMa/romatch/benchmarks/__init__.py +6 -0
  44. submodules/RoMa/romatch/benchmarks/hpatches_sequences_homog_benchmark.py +113 -0
  45. submodules/RoMa/romatch/benchmarks/megadepth_dense_benchmark.py +106 -0
  46. submodules/RoMa/romatch/benchmarks/megadepth_pose_estimation_benchmark.py +118 -0
  47. submodules/RoMa/romatch/benchmarks/megadepth_pose_estimation_benchmark_poselib.py +119 -0
  48. submodules/RoMa/romatch/benchmarks/scannet_benchmark.py +143 -0
  49. submodules/RoMa/romatch/checkpointing/__init__.py +1 -0
  50. submodules/RoMa/romatch/checkpointing/checkpoint.py +60 -0
Dockerfile ADDED
@@ -0,0 +1,72 @@
1
+ FROM nvidia/cuda:12.1.1-devel-ubuntu22.04 AS builder
2
+
3
+ WORKDIR /app
4
+
5
+ COPY . /app/
6
+
7
+ ENV CUDA_HOME=/usr/local/cuda-12.1
8
+ ENV LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
9
+ ENV PATH=$CUDA_HOME/bin:$PATH
10
+
11
+ ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
12
+
13
+ RUN apt-get update && \
14
+ DEBIAN_FRONTEND=noninteractive apt-get install -y \
15
+ build-essential wget curl nano ninja-build unzip libgl-dev ffmpeg && \
16
+ apt-get clean && \
17
+ rm -rf /var/lib/apt/lists/*
18
+
19
+ ENV CONDA_DIR=/opt/conda
20
+
21
+ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
22
+ /bin/bash ~/miniconda.sh -b -p /opt/conda && \
23
+ rm ~/miniconda.sh
24
+
25
+ ENV PATH=$CONDA_DIR/bin:$PATH
26
+
27
+ RUN conda update -n base conda -y && \
28
+ conda install -n base conda-libmamba-solver -y && \
29
+ conda config --set solver libmamba
30
+
31
+ RUN conda create -y -n edgs python=3.10 pip
32
+
33
+ SHELL ["conda", "run", "-n", "edgs", "/bin/bash", "-c"]
34
+
35
+
36
+ RUN conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia -y
37
+
38
+ RUN pip install -e submodules/gaussian-splatting/submodules/diff-gaussian-rasterization --no-build-isolation && \
39
+ pip install -e submodules/gaussian-splatting/submodules/simple-knn --no-build-isolation
40
+
41
+ RUN pip install pycolmap
42
+
43
+ RUN pip install wandb hydra-core tqdm torchmetrics lpips matplotlib rich plyfile imageio imageio-ffmpeg && \
44
+ conda install numpy=1.26.4 -y -c conda-forge --override-channels
45
+
46
+ RUN pip install -e submodules/RoMa
47
+
48
+ RUN pip install plotly scikit-learn moviepy==2.1.1 ffmpeg fastapi[standard]
49
+
50
+ # Final image
51
+
52
+ FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04 AS final
53
+
54
+ WORKDIR /app
55
+
56
+ RUN apt-get update && \
57
+ DEBIAN_FRONTEND=noninteractive apt-get install -y \
58
+ libgl1-mesa-glx libsm6 libxext6 ffmpeg && \
59
+ apt-get clean && \
60
+ rm -rf /var/lib/apt/lists/*
61
+
62
+ COPY --from=builder /opt/conda /opt/conda
63
+
64
+ COPY --from=builder /app /app
65
+
66
+ ENV PATH="/opt/conda/bin:/opt/conda/envs/edgs/bin:$PATH"
67
+
68
+ ENV CUDA_HOME=/usr/local/cuda-12.1
69
+ ENV LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
70
+
71
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
72
+
LICENSE.txt ADDED
@@ -0,0 +1,12 @@
1
+ Copyright 2025, Dmytro Kotovenko, Olga Grebenkova, Björn Ommer
2
+ Redistribution and use in source and binary forms, with or without modification, are permitted for non-commercial academic research and/or non-commercial personal use only provided that the following conditions are met:
3
+
4
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
5
+
6
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
7
+
8
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
9
+
10
+ Any use of this software beyond the above specified conditions requires a separate license. Please contact the copyright holders to discuss license terms.
11
+
12
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
README.md CHANGED
@@ -1,10 +1,234 @@
1
  ---
2
- title: Gs Final
3
- emoji: 🐢
4
- colorFrom: gray
5
- colorTo: green
6
- sdk: docker
7
- pinned: false
8
- ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
1
+ <h1 align="center">EDGS: Eliminating Densification for Efficient Convergence of 3DGS</h1>
2
+
3
+ <p align="center">
4
+ <a href="https://www.linkedin.com/in/dmitry-kotovenko-dl/">Dmytro Kotovenko</a><sup>*</sup> ·
5
+ <a href="https://www.linkedin.com/in/grebenkovao/">Olga Grebenkova</a><sup>*</sup> ·
6
+ <a href="https://ommer-lab.com/people/ommer/">Björn Ommer</a>
7
+ </p>
8
+
9
+ <p align="center">CompVis @ LMU Munich · Munich Center for Machine Learning (MCML) </p>
10
+ <p align="center">* equal contribution </p>
11
+
12
+ <p align="center">
13
+ <a href="https://compvis.github.io/EDGS/"><img src="https://img.shields.io/badge/Project-Page-blue" alt="Project Page"></a>
14
+ <a href="https://arxiv.org/pdf/2504.13204"><img src="https://img.shields.io/badge/arXiv-PDF-b31b1b" alt="Paper"></a>
15
+ <a href="https://colab.research.google.com/github/CompVis/EDGS/blob/main/notebooks/fit_model_to_scene_full.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>
16
+ <a href="https://huggingface.co/spaces/CompVis/EDGS"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue" alt="Hugging Face"></a>
17
+
18
+ </p>
19
+
20
+ <p align="center">
21
+ <img src="./assets/Teaser2.png" width="99%">
22
+ </p>
23
+
24
+ <p>
25
+ <strong>3DGS</strong> initializes with a sparse set of Gaussians and progressively adds more in under-reconstructed regions. In contrast, <strong>EDGS</strong> starts with
26
+ a dense initialization from triangulated 2D correspondences across training image pairs,
27
+ requiring only minimal refinement. This leads to <strong>faster convergence</strong> and <strong>higher rendering quality</strong>. Our method reaches the original 3DGS <strong>LPIPS score in just 25% of the training time</strong> and uses only <strong>60% of the splats</strong>.
28
+ Renderings become <strong>nearly indistinguishable from ground truth after only 3,000 steps — without any densification</strong>.
29
+ </p>
30
+
31
+ <h3 align="center">3D scene reconstruction using our method in 11 seconds.</h3>
32
+ <p align="center">
33
+ <img src="assets/video_fruits_our_optimization.gif" width="480" alt="3D Reconstruction Demo">
34
+ </p>
35
+
36
+
37
+
38
+ ## 📚 Table of Contents
39
+ - [🚀 Quickstart](#sec-quickstart)
40
+ - [🛠️ Installation](#sec-install)
41
+ - [📦 Data](#sec-data)
42
+
43
+ - [🏋️ Training](#sec-training)
44
+ - [🏗️ Reusing Our Model](#sec-reuse)
45
+ - [📄 Citation](#sec-citation)
46
+
47
+ <a id="sec-quickstart"></a>
48
+ ## 🚀 Quickstart
49
+ The fastest way to try our model is through the [Hugging Face demo](https://huggingface.co/spaces/magistrkoljan/EDGS), which lets you upload images or a video and interactively rotate the resulting 3D scene. For broad accessibility, we currently support only **forward-facing scenes**.
50
+ #### Steps:
51
+ 1. Upload a list of photos or a single video.
52
+ 2. Click **📸 Preprocess Input** to estimate 3D positions using COLMAP.
53
+ 3. Click **🚀 Start Reconstruction** to run the model.
54
+
55
+ You can also **explore the reconstructed scene in 3D** directly in the browser.
56
+
57
+ > ⚡ Runtime: EDGS typically takes just **10–20 seconds**, plus **5–10 seconds** for COLMAP processing. Additional time may be needed to save outputs (model, video, 3D preview).
58
+
59
+ You can also run the same app locally on your machine with the following command:
60
+ ```CUDA_VISIBLE_DEVICES=0 python gradio_demo.py --port 7862 --no_share```
61
+ Without the `--no_share` flag, Gradio will also print a public address for the app that you can share with others, allowing them to process their data on your server.
62
+
63
+ Alternatively, check our [Colab notebook](https://colab.research.google.com/github/CompVis/EDGS/blob/main/notebooks/fit_model_to_scene_full.ipynb).
64
+
65
+ ###
66
+
67
+
68
+
69
+ <a id="sec-install"></a>
70
+ ## 🛠️ Installation
71
+
72
+ You can either run `install.sh` or manually install using the following:
73
+
74
+ ```bash
75
+ git clone [email protected]:CompVis/EDGS.git --recursive
76
+ cd EDGS
77
+ git submodule update --init --recursive
78
+
79
+ conda create -y -n edgs python=3.10 pip
80
+ conda activate edgs
81
+
82
+ # Set up path to your CUDA. In our experience similar versions like 12.2 also work well
83
+ export CUDA_HOME=/usr/local/cuda-12.1
84
+ export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
85
+ export PATH=$CUDA_HOME/bin:$PATH
86
+
87
+ conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia -y
88
+ conda install nvidia/label/cuda-12.1.0::cuda-toolkit -y
89
+
90
+ pip install -e submodules/gaussian-splatting/submodules/diff-gaussian-rasterization
91
+ pip install -e submodules/gaussian-splatting/submodules/simple-knn
92
+
93
+ # For COLMAP and pycolmap
94
+ # Optionally install original colmap but probably pycolmap suffices
95
+ # conda install conda-forge/label/colmap_dev::colmap
96
+ pip install pycolmap
97
+
98
+
99
+ pip install wandb hydra-core tqdm torchmetrics lpips matplotlib rich plyfile imageio imageio-ffmpeg
100
+ conda install numpy=1.26.4 -y -c conda-forge --override-channels
101
+
102
+ pip install -e submodules/RoMa
103
+ conda install anaconda::jupyter --yes
104
+
105
+ # Stuff necessary for gradio and visualizations
106
+ pip install gradio
107
+ pip install plotly scikit-learn moviepy==2.1.1 ffmpeg
108
+ pip install open3d
109
+ ```
110
+
111
+ <a id="sec-data"></a>
112
+ ## 📦 Data
113
+
114
+ We evaluated on the following datasets:
115
+
116
+ - **MipNeRF360** — download [here](https://jonbarron.info/mipnerf360/). Unzip "Dataset Pt. 1" and "Dataset Pt. 2", then merge scenes.
117
+ - **Tanks & Temples + Deep Blending** — from the [original 3DGS repo](https://repo-sam.inria.fr/fungraph/3d-gaussian-splatting/datasets/input/tandt_db.zip).
118
+
119
+ ### Using Your Own Dataset
120
+
121
+ You can use the same data format as the [3DGS project](https://github.com/graphdeco-inria/gaussian-splatting?tab=readme-ov-file#processing-your-own-scenes). Please follow their guide to prepare your scene.
122
+
123
+ Expected folder structure:
124
+ ```
125
+ scene_folder
126
+ |---images
127
+ | |---<image 0>
128
+ | |---<image 1>
129
+ | |---...
130
+ |---sparse
131
+ |---0
132
+ |---cameras.bin
133
+ |---images.bin
134
+ |---points3D.bin
135
+ ```
136
+
137
+ The NeRF synthetic format is also supported.
138
+
139
+ You can also use the functions provided in our code to convert a collection of images or a single video into the required format. However, this may require some tweaking, and processing can take a long time for large image collections with little overlap; a minimal sketch is shown below.
140
+
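+ For example, a minimal sketch of turning a single video into a COLMAP-style scene folder, using the preprocessing helpers from `source.utils_preprocess` (the same calls the Gradio demo uses); the paths are placeholders:
+
+ ```python
+ # Minimal sketch: video -> COLMAP-style scene folder, using the repo's preprocessing helpers.
+ # "my_video.mp4" and "my_scene" are placeholder paths.
+ from source.utils_preprocess import (
+     read_video_frames, preprocess_frames, select_optimal_frames,
+     save_frames_to_scene_dir, run_colmap_on_scene,
+ )
+
+ frames = read_video_frames(video_input="my_video.mp4", max_size=1024)  # decode and resize frames
+ scores = preprocess_frames(frames)                                     # score frames for selection
+ idxs = select_optimal_frames(scores=scores, k=16)                      # keep k reference frames
+ save_frames_to_scene_dir(frames=[frames[i] for i in idxs], scene_dir="my_scene")
+ run_colmap_on_scene("my_scene")                                        # estimate camera poses with COLMAP
+ ```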
141
+ <a id="sec-training"></a>
142
+ ## 🏋️ Training
143
+
144
+
145
+ To optimize a single scene in COLMAP format, use the following command:
146
+ ```bash
147
+ python train.py \
148
+ train.gs_epochs=30000 \
149
+ train.no_densify=True \
150
+ gs.dataset.source_path=<scene folder> \
151
+ gs.dataset.model_path=<output folder> \
152
+ init_wC.matches_per_ref=20000 \
153
+ init_wC.nns_per_ref=3 \
154
+ init_wC.num_refs=180
155
+ ```
156
+ <details>
157
+ <summary><span style="font-weight: bold;">Command Line Arguments for train.py</span></summary>
158
+
159
+ * `train.gs_epochs`
160
+ Number of training iterations (steps) for Gaussian Splatting.
161
+ * `train.no_densify`
162
+ Disables densification when set to `True` (the default in `configs/train.yaml` is `False`).
163
+ * `gs.dataset.source_path`
164
+ Path to your input dataset directory. This should follow the same format as the original 3DGS dataset structure.
165
+ * `gs.dataset.model_path`
166
+ Output directory where the trained model, logs, and renderings will be saved.
167
+ * `init_wC.matches_per_ref`
168
+ Number of 2D feature correspondences to extract per reference view for initialization. More matches lead to more Gaussians.
169
+ * `init_wC.nns_per_ref`
170
+ Number of nearest neighbor images used per reference during matching.
171
+ * `init_wC.num_refs`
172
+ Total number of reference views sampled.
173
+ * `wandb.mode`
174
+ Specifies how Weights & Biases (W&B) logging is handled.
175
+
176
+ - Default: `"disabled"`
177
+ - Options:
178
+ - `"online"` — log to the W&B server in real-time
179
+ - `"offline"` — save logs locally to sync later
180
+ - `"disabled"` — turn off W&B logging entirely
181
+
182
+ If you want to enable W&B logging, make sure to also configure:
183
+
184
+ - `wandb.project` — the name of your W&B project
185
+ - `wandb.entity` — your W&B username or team name
186
+
187
+ Example override:
188
+ ```bash
189
+ wandb.mode=online wandb.project=EDGS wandb.entity=your_username train.gs_epochs=15_000 init_wC.matches_per_ref=15_000
190
+ ```
191
+ </details>
192
+ <br>
193
+
194
+ To run full evaluation on all datasets:
195
+
196
+ ```bash
197
+ python full_eval.py -m360 <mipnerf360 folder> -tat <tanks and temples folder> -db <deep blending folder>
198
+ ```
199
+ <a id="sec-reuse"></a>
200
+ ## 🏗️ Reusing Our Model
201
+
202
+ Our model is essentially a better **initialization module** for Gaussian Splatting. You can integrate it into your pipeline by calling:
203
+
204
+ ```python
205
+ source.corr_init.init_gaussians_with_corr(...)
206
+ ```
207
+ ### Input arguments:
208
+ - A GaussianModel and Scene instance
209
+ - A configuration namespace `cfg.init_wC` to specify parameters like the number of matches, neighbors, and reference views
210
+ - A RoMA model (automatically instantiated if not provided)
211
+
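+ A minimal sketch of how such a call could be wired up; the argument order and names below are illustrative, not the exact signature, so check `source/corr_init.py` for the real interface:
+
+ ```python
+ # Sketch only: see source/corr_init.py for the exact interface.
+ from source.corr_init import init_gaussians_with_corr
+
+ # Assumes you already have a GaussianModel, a Scene, and a Hydra config with an
+ # init_wC block (matches_per_ref, nns_per_ref, num_refs, ...), e.g. configs/train.yaml.
+ init_gaussians_with_corr(
+     gaussians,        # your GaussianModel instance
+     scene,            # your Scene instance
+     cfg.init_wC,      # correspondence-initialization settings
+     roma_model=None,  # RoMa matcher; instantiated automatically if not provided
+ )
+ ```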
212
+
213
+
214
+ <a id="sec-citation"></a>
215
+ ## 📄 Citation
216
+ ```bibtex
217
+ @misc{kotovenko2025edgseliminatingdensificationefficient,
218
+ title={EDGS: Eliminating Densification for Efficient Convergence of 3DGS},
219
+ author={Dmytro Kotovenko and Olga Grebenkova and Björn Ommer},
220
+ year={2025},
221
+ eprint={2504.13204},
222
+ archivePrefix={arXiv},
223
+ primaryClass={cs.GR},
224
+ url={https://arxiv.org/abs/2504.13204},
225
+ }
226
+ ```
227
  ---
 
 
 
 
 
 
 
228
 
229
+ # TODO:
230
+ - [ ] Code for training and processing forward-facing scenes.
231
+ - [ ] More data examples
232
+
233
+
234
+
configs/gs/base.yaml ADDED
@@ -0,0 +1,51 @@
1
+ _target_: source.networks.Warper3DGS
2
+
3
+ verbose: True
4
+ viewpoint_stack: !!null
5
+ sh_degree: 3
6
+
7
+ opt:
8
+ iterations: 30000
9
+ position_lr_init: 0.00016
10
+ position_lr_final: 1.6e-06
11
+ position_lr_delay_mult: 0.01
12
+ position_lr_max_steps: 30000
13
+ feature_lr: 0.0025
14
+ opacity_lr: 0.025
15
+ scaling_lr: 0.005
16
+ rotation_lr: 0.001
17
+ percent_dense: 0.01
18
+ lambda_dssim: 0.2
19
+ densification_interval: 100
20
+ opacity_reset_interval: 30000
21
+ densify_from_iter: 500
22
+ densify_until_iter: 15000
23
+ densify_grad_threshold: 0.0002
24
+ random_background: false
25
+ save_iterations: [3000, 7000, 15000, 30000]
26
+ batch_size: 64
27
+ exposure_lr_init: 0.01
28
+ exposure_lr_final: 0.0001
29
+ exposure_lr_delay_steps: 0
30
+ exposure_lr_delay_mult: 0.0
31
+
32
+ TRAIN_CAM_IDX_TO_LOG: 50
33
+ TEST_CAM_IDX_TO_LOG: 10
34
+
35
+ pipe:
36
+ convert_SHs_python: False
37
+ compute_cov3D_python: False
38
+ debug: False
39
+ antialiasing: False
40
+
41
+ dataset:
42
+ densify_until_iter: 15000
43
+ source_path: '' #path to dataset
44
+ model_path: '' #path to logs
45
+ images: images
46
+ resolution: -1
47
+ white_background: false
48
+ data_device: cuda
49
+ eval: false
50
+ depths: ""
51
+ train_test_exp: False
configs/train.yaml ADDED
@@ -0,0 +1,38 @@
1
+ defaults:
2
+ - gs: base
3
+ - _self_
4
+
5
+ seed: 228
6
+
7
+ wandb:
8
+ mode: "online" # "disabled" for no logging
9
+ entity: "3dcorrespondence"
10
+ project: "Adv3DGS"
11
+ group: null
12
+ name: null
13
+ tag: "debug"
14
+
15
+ train:
16
+ gs_epochs: 0 # number of 3dgs iterations
17
+ reduce_opacity: True
18
+ no_densify: False # if True, the model will not be densified
19
+ max_lr: True
20
+
21
+ load:
22
+ gs: null #path to 3dgs checkpoint
23
+ gs_step: null #number of iterations, e.g. 7000
24
+
25
+ device: "cuda:0"
26
+ verbose: true
27
+
28
+ init_wC:
29
+ use: True # use EDGS
30
+ matches_per_ref: 15_000 # number of matches per reference
31
+ num_refs: 180 # number of reference images
32
+ nns_per_ref: 3 # number of nearest neighbors per reference
33
+ scaling_factor: 0.001
34
+ proj_err_tolerance: 0.01
35
+ roma_model: "outdoors" # you can change this to "indoors" or "outdoors"
36
+ add_SfM_init : False
37
+
38
+
extra/archive/rasterizer_impl.h ADDED
@@ -0,0 +1,75 @@
1
+ /*
2
+ * Copyright (C) 2023, Inria
3
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ * All rights reserved.
5
+ *
6
+ * This software is free for non-commercial, research and evaluation use
7
+ * under the terms of the LICENSE.md file.
8
+ *
9
+ * For inquiries contact [email protected]
10
+ */
11
+
12
+ #pragma once
13
+
14
+ #include <cstdint>
15
+ #include <iostream>
16
+ #include <vector>
17
+ #include "rasterizer.h"
18
+ #include <cuda_runtime_api.h>
19
+
20
+ namespace CudaRasterizer
21
+ {
22
+ template <typename T>
23
+ static void obtain(char*& chunk, T*& ptr, std::size_t count, std::size_t alignment)
24
+ {
25
+ std::size_t offset = (reinterpret_cast<std::uintptr_t>(chunk) + alignment - 1) & ~(alignment - 1);
26
+ ptr = reinterpret_cast<T*>(offset);
27
+ chunk = reinterpret_cast<char*>(ptr + count);
28
+ }
29
+
30
+ struct GeometryState
31
+ {
32
+ size_t scan_size;
33
+ float* depths;
34
+ char* scanning_space;
35
+ bool* clamped;
36
+ int* internal_radii;
37
+ float2* means2D;
38
+ float* cov3D;
39
+ float4* conic_opacity;
40
+ float* rgb;
41
+ uint32_t* point_offsets;
42
+ uint32_t* tiles_touched;
43
+
44
+ static GeometryState fromChunk(char*& chunk, size_t P);
45
+ };
46
+
47
+ struct ImageState
48
+ {
49
+ uint2* ranges;
50
+ uint32_t* n_contrib;
51
+ float* accum_alpha;
52
+
53
+ static ImageState fromChunk(char*& chunk, size_t N);
54
+ };
55
+
56
+ struct BinningState
57
+ {
58
+ size_t sorting_size;
59
+ uint64_t* point_list_keys_unsorted;
60
+ uint64_t* point_list_keys;
61
+ uint32_t* point_list_unsorted;
62
+ uint32_t* point_list;
63
+ char* list_sorting_space;
64
+
65
+ static BinningState fromChunk(char*& chunk, size_t P);
66
+ };
67
+
68
+ template<typename T>
69
+ size_t required(size_t P)
70
+ {
71
+ char* size = nullptr;
72
+ T::fromChunk(size, P);
73
+ return ((size_t)size) + 128;
74
+ }
75
+ };
extra/archive/simple-knn.patch.txt ADDED
@@ -0,0 +1,13 @@
1
+ diff --git a/simple_knn.cu b/simple_knn.cu
2
+ index e72e4c9..b2deb1b 100644
3
+ --- a/simple_knn.cu
4
+ +++ b/simple_knn.cu
5
+ @@ -11,6 +11,8 @@
6
+
7
+ #define BOX_SIZE 1024
8
+
9
+ +#include <float.h>
10
+ +
11
+ #include "cuda_runtime.h"
12
+ #include "device_launch_parameters.h"
13
+ #include "simple_knn.h"
full_eval.py ADDED
@@ -0,0 +1,102 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ import os
13
+ from argparse import ArgumentParser
14
+
15
+ mipnerf360_outdoor_scenes = ["bicycle", "flowers", "garden", "stump", "treehill"]
16
+ mipnerf360_indoor_scenes = ["room", "counter", "kitchen", "bonsai"]
17
+ tanks_and_temples_scenes = ["truck", "train"]
18
+ deep_blending_scenes = ["drjohnson", "playroom"]
19
+
20
+ parser = ArgumentParser(description="Full evaluation script parameters")
21
+ parser.add_argument("--skip_training", action="store_true")
22
+ parser.add_argument("--skip_rendering", action="store_true")
23
+ parser.add_argument("--skip_metrics", action="store_true")
24
+ parser.add_argument("--output_path", default="./eval")
25
+ args, _ = parser.parse_known_args()
26
+
27
+ all_scenes = []
28
+ all_scenes.extend(mipnerf360_outdoor_scenes)
29
+ all_scenes.extend(mipnerf360_indoor_scenes)
30
+ all_scenes.extend(tanks_and_temples_scenes)
31
+ all_scenes.extend(deep_blending_scenes)
32
+
33
+ if not args.skip_training or not args.skip_rendering:
34
+ parser.add_argument('--mipnerf360', "-m360", required=True, type=str)
35
+ parser.add_argument("--tanksandtemples", "-tat", required=True, type=str)
36
+ parser.add_argument("--deepblending", "-db", required=True, type=str)
37
+ args = parser.parse_args()
38
+
39
+ if not args.skip_training:
40
+ name = "EDGS_"
41
+ common_args = " --quiet --eval --test_iterations -1 "
42
+ for scene in mipnerf360_outdoor_scenes:
43
+ source = args.mipnerf360 + "/" + scene
44
+ experiment = name + scene
45
+ os.system(f"python train.py verbose=True gs.dataset.source_path={source} gs.dataset.model_path={args.output_path}/mipnerf/{scene} wandb.name={experiment} init_wC.use=True train.gs_epochs=30000 init_wC.matches_per_ref=25_000 init_wC.nns_per_ref=3 gs.dataset.images=images_4 init_wC.num_refs=180 train.no_densify=True")
46
+ for scene in mipnerf360_indoor_scenes:
47
+ source = args.mipnerf360 + "/" + scene
48
+ experiment = name + scene
49
+ os.system(f"python train.py verbose=True gs.dataset.source_path={source} gs.dataset.model_path={args.output_path}/mipnerf/{scene} wandb.name={experiment} init_wC.use=True train.gs_epochs=30000 init_wC.matches_per_ref=25_000 init_wC.nns_per_ref=3 gs.dataset.images=images_2 init_wC.num_refs=180 train.no_densify=True")
50
+ for scene in tanks_and_temples_scenes:
51
+ source = args.tanksandtemples + "/" + scene
52
+ experiment = name + scene +"_tandt"
53
+ os.system(f"python train.py verbose=True gs.dataset.source_path={source} gs.dataset.model_path={args.output_path}/mipnerf/{scene} wandb.name={experiment} init_wC.use=True train.gs_epochs=30000 init_wC.matches_per_ref=15_000 init_wC.nns_per_ref=3 init_wC.num_refs=180 train.no_densify=True")
54
+ for scene in deep_blending_scenes:
55
+ source = args.deepblending + "/" + scene
56
+ experiment = name + scene + "_db"
57
+ os.system(f"python train.py verbose=True gs.dataset.source_path={source} gs.dataset.model_path={args.output_path}/mipnerf/{scene} wandb.name={experiment} init_wC.use=True train.gs_epochs=30000 init_wC.matches_per_ref=15_000 init_wC.nns_per_ref=3 init_wC.num_refs=180 train.no_densify=True")
58
+
59
+
60
+ if not args.skip_rendering:
61
+ all_sources = []
62
+ for scene in mipnerf360_outdoor_scenes:
63
+ all_sources.append(args.mipnerf360 + "/" + scene)
64
+ for scene in mipnerf360_indoor_scenes:
65
+ all_sources.append(args.mipnerf360 + "/" + scene)
66
+ for scene in tanks_and_temples_scenes:
67
+ all_sources.append(args.tanksandtemples + "/" + scene )
68
+ for scene in deep_blending_scenes:
69
+ all_sources.append(args.deepblending + "/" + scene)
70
+
71
+ all_outputs = []
72
+ for scene in mipnerf360_outdoor_scenes:
73
+ all_outputs.append(args.output_path + "/mipnerf/" + scene)
74
+ for scene in mipnerf360_indoor_scenes:
75
+ all_outputs.append(args.output_path + "/mipnerf/" + scene)
76
+ for scene in tanks_and_temples_scenes:
77
+ all_outputs.append(args.output_path + "/tandt/" + scene)
78
+ for scene in deep_blending_scenes:
79
+ all_outputs.append(args.output_path + "/db/" + scene)
80
+
81
+
82
+ common_args = " --quiet --eval --skip_train"
83
+ for scene, source, output in zip(all_scenes, all_sources, all_outputs):
84
+ os.system("python ./submodules/gaussian-splatting/render.py --iteration 7000 -s " + source + " -m " + output + common_args)
85
+ os.system("python ./submodules/gaussian-splatting/render.py --iteration 30000 -s " + source + " -m " + output + common_args)
86
+
87
+ if not args.skip_metrics:
88
+ all_outputs = []
89
+ for scene in mipnerf360_outdoor_scenes:
90
+ all_outputs.append(args.output_path + "/mipnerf/" + scene)
91
+ for scene in mipnerf360_indoor_scenes:
92
+ all_outputs.append(args.output_path + "/mipnerf/" + scene)
93
+ for scene in tanks_and_temples_scenes:
94
+ all_outputs.append(args.output_path + "/tandt/" + scene)
95
+ for scene in deep_blending_scenes:
96
+ all_outputs.append(args.output_path + "/db/" + scene)
97
+
98
+ scenes_string = ""
99
+ for scene, output in zip(all_scenes, all_outputs):
100
+ scenes_string += "\"" + output + "\" "
101
+
102
+ os.system("python metrics.py -m " + scenes_string)
gradio_demo.py ADDED
@@ -0,0 +1,424 @@
1
+ import torch
2
+ import os
3
+ import shutil
4
+ import tempfile
5
+ import argparse
6
+ import gradio as gr
7
+ import sys
8
+ import io
9
+ from PIL import Image
10
+ import numpy as np
11
+ from source.utils_aux import set_seed
12
+ from source.utils_preprocess import read_video_frames, preprocess_frames, select_optimal_frames, save_frames_to_scene_dir, run_colmap_on_scene
13
+ from source.trainer import EDGSTrainer
14
+ from hydra import initialize, compose
15
+ import hydra
16
+ import time
17
+ from source.visualization import generate_circular_camera_path, save_numpy_frames_as_mp4, generate_fully_smooth_cameras_with_tsp, put_text_on_image
18
+ import contextlib
19
+ import base64
20
+
21
+
22
+ # Init RoMA model:
23
+ sys.path.append('../submodules/RoMa')
24
+ from romatch import roma_outdoor, roma_indoor
25
+
26
+ roma_model = roma_indoor(device="cuda:0")
27
+ roma_model.upsample_preds = False
28
+ roma_model.symmetric = False
29
+
30
+ STATIC_FILE_SERVING_FOLDER = "./served_files"
31
+ MODEL_PATH = None
32
+ os.makedirs(STATIC_FILE_SERVING_FOLDER, exist_ok=True)
33
+
34
+ trainer = None
35
+
36
+ class Tee(io.TextIOBase):
37
+ def __init__(self, *streams):
38
+ self.streams = streams
39
+
40
+ def write(self, data):
41
+ for stream in self.streams:
42
+ stream.write(data)
43
+ return len(data)
44
+
45
+ def flush(self):
46
+ for stream in self.streams:
47
+ stream.flush()
48
+
49
+ def capture_logs(func, *args, **kwargs):
50
+ log_capture_string = io.StringIO()
51
+ tee = Tee(sys.__stdout__, log_capture_string)
52
+ with contextlib.redirect_stdout(tee):
53
+ result = func(*args, **kwargs)
54
+ return result, log_capture_string.getvalue()
55
+
56
+ # Training Pipeline
57
+ def run_training_pipeline(scene_dir,
58
+ num_ref_views=16,
59
+ num_corrs_per_view=20000,
60
+ num_steps=1_000,
61
+ mode_toggle="Ours (EDGS)"):
62
+ with initialize(config_path="./configs", version_base="1.1"):
63
+ cfg = compose(config_name="train")
64
+
65
+ scene_name = os.path.basename(scene_dir)
66
+ model_output_dir = f"./outputs/{scene_name}_trained"
67
+
68
+ cfg.wandb.mode = "disabled"
69
+ cfg.gs.dataset.model_path = model_output_dir
70
+ cfg.gs.dataset.source_path = scene_dir
71
+ cfg.gs.dataset.images = "images"
72
+
73
+ cfg.gs.opt.TEST_CAM_IDX_TO_LOG = 12
74
+ cfg.train.gs_epochs = 30000
75
+
76
+ if mode_toggle=="Ours (EDGS)":
77
+ cfg.gs.opt.opacity_reset_interval = 1_000_000
78
+ cfg.train.reduce_opacity = True
79
+ cfg.train.no_densify = True
80
+ cfg.train.max_lr = True
81
+
82
+ cfg.init_wC.use = True
83
+ cfg.init_wC.matches_per_ref = num_corrs_per_view
84
+ cfg.init_wC.nns_per_ref = 1
85
+ cfg.init_wC.num_refs = num_ref_views
86
+ cfg.init_wC.add_SfM_init = False
87
+ cfg.init_wC.scaling_factor = 0.00077 * 2.
88
+
89
+ set_seed(cfg.seed)
90
+ os.makedirs(cfg.gs.dataset.model_path, exist_ok=True)
91
+
92
+ global trainer
93
+ global MODEL_PATH
94
+ generator3dgs = hydra.utils.instantiate(cfg.gs, do_train_test_split=False)
95
+ trainer = EDGSTrainer(GS=generator3dgs, training_config=cfg.gs.opt, device=cfg.device, log_wandb=cfg.wandb.mode != 'disabled')
96
+
97
+ # Disable evaluation and saving
98
+ trainer.saving_iterations = []
99
+ trainer.evaluate_iterations = []
100
+
101
+ # Initialize
102
+ trainer.timer.start()
103
+ start_time = time.time()
104
+ trainer.init_with_corr(cfg.init_wC, roma_model=roma_model)
105
+ time_for_init = time.time()-start_time
106
+
107
+ viewpoint_cams = trainer.GS.scene.getTrainCameras()
108
+ path_cameras = generate_fully_smooth_cameras_with_tsp(existing_cameras=viewpoint_cams,
109
+ n_selected=6, # 8
110
+ n_points_per_segment=30, # 30
111
+ closed=False)
112
+ path_cameras = path_cameras + path_cameras[::-1]
113
+
114
+ path_renderings = []
115
+ idx = 0
116
+ # Visualize after init
117
+ for _ in range(120):
118
+ with torch.no_grad():
119
+ viewpoint_cam = path_cameras[idx]
120
+ idx = (idx + 1) % len(path_cameras)
121
+ render_pkg = trainer.GS(viewpoint_cam)
122
+ image = render_pkg["render"]
123
+ image_np = np.clip(image.detach().cpu().numpy().transpose(1, 2, 0), 0, 1)
124
+ image_np = (image_np * 255).astype(np.uint8)
125
+ path_renderings.append(put_text_on_image(img=image_np,
126
+ text=f"Init stage.\nTime:{time_for_init:.3f}s. "))
127
+ path_renderings = path_renderings + [put_text_on_image(img=image_np, text=f"Start fitting.\nTime:{time_for_init:.3f}s. ")]*30
128
+
129
+ # Train and save visualizations during training.
130
+ start_time = time.time()
131
+ for _ in range(int(num_steps//10)):
132
+ with torch.no_grad():
133
+ viewpoint_cam = path_cameras[idx]
134
+ idx = (idx + 1) % len(path_cameras)
135
+ render_pkg = trainer.GS(viewpoint_cam)
136
+ image = render_pkg["render"]
137
+ image_np = np.clip(image.detach().cpu().numpy().transpose(1, 2, 0), 0, 1)
138
+ image_np = (image_np * 255).astype(np.uint8)
139
+ path_renderings.append(put_text_on_image(
140
+ img=image_np,
141
+ text=f"Fitting stage.\nTime:{time_for_init + time.time()-start_time:.3f}s. "))
142
+
143
+ cfg.train.gs_epochs = 10
144
+ trainer.train(cfg.train)
145
+ print(f"Time elapsed: {(time_for_init + time.time()-start_time):.2f}s.")
146
+ # if (cfg.init_wC.use == False) and (time_for_init + time.time()-start_time) > 60:
147
+ # break
148
+ final_time = time.time()
149
+
150
+ # Add static frame. To highlight we're done
151
+ path_renderings += [put_text_on_image(
152
+ img=image_np, text=f"Done.\nTime:{time_for_init + final_time -start_time:.3f}s. ")]*30
153
+ # Final rendering at the end.
154
+ for _ in range(len(path_cameras)):
155
+ with torch.no_grad():
156
+ viewpoint_cam = path_cameras[idx]
157
+ idx = (idx + 1) % len(path_cameras)
158
+ render_pkg = trainer.GS(viewpoint_cam)
159
+ image = render_pkg["render"]
160
+ image_np = np.clip(image.detach().cpu().numpy().transpose(1, 2, 0), 0, 1)
161
+ image_np = (image_np * 255).astype(np.uint8)
162
+ path_renderings.append(put_text_on_image(img=image_np,
163
+ text=f"Final result.\nTime:{time_for_init + final_time -start_time:.3f}s. "))
164
+
165
+ trainer.save_model()
166
+ final_video_path = os.path.join(STATIC_FILE_SERVING_FOLDER, f"{scene_name}_final.mp4")
167
+ save_numpy_frames_as_mp4(frames=path_renderings, output_path=final_video_path, fps=30, center_crop=0.85)
168
+ MODEL_PATH = cfg.gs.dataset.model_path
169
+ ply_path = os.path.join(cfg.gs.dataset.model_path, f"point_cloud/iteration_{trainer.gs_step}/point_cloud.ply")
170
+ shutil.copy(ply_path, os.path.join(STATIC_FILE_SERVING_FOLDER, "point_cloud_final.ply"))
171
+
172
+ return final_video_path, ply_path
173
+
174
+ # Gradio Interface
175
+ def gradio_interface(input_path, num_ref_views, num_corrs, num_steps):
176
+ images, scene_dir = run_full_pipeline(input_path, num_ref_views, num_corrs, max_size=1024)
177
+ shutil.copytree(scene_dir, STATIC_FILE_SERVING_FOLDER+'/scene_colmaped', dirs_exist_ok=True)
178
+ (final_video_path, ply_path), log_output = capture_logs(run_training_pipeline,
179
+ scene_dir,
180
+ num_ref_views,
181
+ num_corrs,
182
+ num_steps)
183
+ images_rgb = [img[:, :, ::-1] for img in images]
184
+ return images_rgb, final_video_path, scene_dir, ply_path, log_output
185
+
186
+ # Dummy Render Functions
187
+ def render_all_views(scene_dir):
188
+ viewpoint_cams = trainer.GS.scene.getTrainCameras()
189
+ path_cameras = generate_fully_smooth_cameras_with_tsp(existing_cameras=viewpoint_cams,
190
+ n_selected=8,
191
+ n_points_per_segment=60,
192
+ closed=False)
193
+ path_cameras = path_cameras + path_cameras[::-1]
194
+
195
+ path_renderings = []
196
+ with torch.no_grad():
197
+ for viewpoint_cam in path_cameras:
198
+ render_pkg = trainer.GS(viewpoint_cam)
199
+ image = render_pkg["render"]
200
+ image_np = np.clip(image.detach().cpu().numpy().transpose(1, 2, 0), 0, 1)
201
+ image_np = (image_np * 255).astype(np.uint8)
202
+ path_renderings.append(image_np)
203
+ save_numpy_frames_as_mp4(frames=path_renderings,
204
+ output_path=os.path.join(STATIC_FILE_SERVING_FOLDER, "render_all_views.mp4"),
205
+ fps=30,
206
+ center_crop=0.85)
207
+
208
+ return os.path.join(STATIC_FILE_SERVING_FOLDER, "render_all_views.mp4")
209
+
210
+ def render_circular_path(scene_dir):
211
+ viewpoint_cams = trainer.GS.scene.getTrainCameras()
212
+ path_cameras = generate_circular_camera_path(existing_cameras=viewpoint_cams,
213
+ N=240,
214
+ radius_scale=0.65,
215
+ d=0)
216
+
217
+ path_renderings = []
218
+ with torch.no_grad():
219
+ for viewpoint_cam in path_cameras:
220
+ render_pkg = trainer.GS(viewpoint_cam)
221
+ image = render_pkg["render"]
222
+ image_np = np.clip(image.detach().cpu().numpy().transpose(1, 2, 0), 0, 1)
223
+ image_np = (image_np * 255).astype(np.uint8)
224
+ path_renderings.append(image_np)
225
+ save_numpy_frames_as_mp4(frames=path_renderings,
226
+ output_path=os.path.join(STATIC_FILE_SERVING_FOLDER, "render_circular_path.mp4"),
227
+ fps=30,
228
+ center_crop=0.85)
229
+
230
+ return os.path.join(STATIC_FILE_SERVING_FOLDER, "render_circular_path.mp4")
231
+
232
+ # Download Functions
233
+ def download_cameras():
234
+ path = os.path.join(MODEL_PATH, "cameras.json")
235
+ return f"[📥 Download Cameras.json](file={path})"
236
+
237
+ def download_model():
238
+ path = os.path.join(STATIC_FILE_SERVING_FOLDER, "point_cloud_final.ply")
239
+ return f"[📥 Download Pretrained Model (.ply)](file={path})"
240
+
241
+ # Full pipeline helpers
242
+ def run_full_pipeline(input_path, num_ref_views, num_corrs, max_size=1024):
243
+ tmpdirname = tempfile.mkdtemp()
244
+ scene_dir = os.path.join(tmpdirname, "scene")
245
+ os.makedirs(scene_dir, exist_ok=True)
246
+
247
+ selected_frames = process_input(input_path, num_ref_views, scene_dir, max_size)
248
+ run_colmap_on_scene(scene_dir)
249
+
250
+ return selected_frames, scene_dir
251
+
252
+ # Preprocess Input
253
+ def process_input(input_path, num_ref_views, output_dir, max_size=1024):
254
+ if isinstance(input_path, (str, os.PathLike)):
255
+ if os.path.isdir(input_path):
256
+ frames = []
257
+ for img_file in sorted(os.listdir(input_path)):
258
+ if img_file.lower().endswith(('jpg', 'jpeg', 'png')):
259
+ img = Image.open(os.path.join(input_path, img_file)).convert('RGB')  # read from the input folder, not the output scene dir
260
+ img.thumbnail((max_size, max_size))
261
+ frames.append(np.array(img))
262
+ else:
263
+ frames = read_video_frames(video_input=input_path, max_size=max_size)
264
+ else:
265
+ frames = read_video_frames(video_input=input_path, max_size=max_size)
266
+
267
+ frames_scores = preprocess_frames(frames)
268
+ selected_frames_indices = select_optimal_frames(scores=frames_scores,
269
+ k=min(num_ref_views, len(frames)))
270
+ selected_frames = [frames[frame_idx] for frame_idx in selected_frames_indices]
271
+
272
+ save_frames_to_scene_dir(frames=selected_frames, scene_dir=output_dir)
273
+ return selected_frames
274
+
275
+ def preprocess_input(input_path, num_ref_views, max_size=1024):
276
+ tmpdirname = tempfile.mkdtemp()
277
+ scene_dir = os.path.join(tmpdirname, "scene")
278
+ os.makedirs(scene_dir, exist_ok=True)
279
+ selected_frames = process_input(input_path, num_ref_views, scene_dir, max_size)
280
+ run_colmap_on_scene(scene_dir)
281
+ return selected_frames, scene_dir
282
+
283
+ def start_training(scene_dir, num_ref_views, num_corrs, num_steps):
284
+ return capture_logs(run_training_pipeline, scene_dir, num_ref_views, num_corrs, num_steps)
285
+
286
+
287
+ # Gradio App
288
+ with gr.Blocks() as demo:
289
+ with gr.Row():
290
+ with gr.Column(scale=6):
291
+ gr.Markdown("""
292
+ ## <span style='font-size: 20px;'>📄 EDGS: Eliminating Densification for Efficient Convergence of 3DGS</span>
293
+ 🔗 <a href='https://compvis.github.io/EDGS' target='_blank'>Project Page</a>
294
+ """, elem_id="header")
295
+
296
+ gr.Markdown("""
297
+ ### <span style='font-size: 22px;'>🛠️ How to Use This Demo</span>
298
+
299
+ 1. Upload a **front-facing video** or **a folder of images** of a **static** scene.
300
+ 2. Use the sliders to configure the number of reference views, correspondences, and optimization steps.
301
+ 3. First click **📸 Preprocess Input** to extract frames from the video (for video input) and run COLMAP.
302
+ 4. Then click **🚀 Start Reconstruction** to actually launch the reconstruction pipeline.
303
+ 5. Watch the training visualization and explore the 3D model.
304
+ ‼️ **If you see nothing in the 3D model viewer**, try rotating or zooming — sometimes the initial camera orientation is off.
305
+
306
+
307
+ ✅ Best for scenes with small camera motion.
308
+ ❗ For full 360° or large-scale scenes, we recommend the Colab version (see project page).
309
+ """, elem_id="quickstart")
310
+
311
+
312
+ scene_dir_state = gr.State()
313
+ ply_model_state = gr.State()
314
+
315
+ with gr.Row():
316
+ with gr.Column(scale=2):
317
+ input_file = gr.File(label="Upload Video or Images",
318
+ file_types=[".mp4", ".avi", ".mov", ".png", ".jpg", ".jpeg"],
319
+ file_count="multiple")
320
+ gr.Examples(
321
+ examples = [
322
+ [["assets/examples/video_bakery.mp4"]],
323
+ [["assets/examples/video_flowers.mp4"]],
324
+ [["assets/examples/video_fruits.mp4"]],
325
+ [["assets/examples/video_plant.mp4"]],
326
+ [["assets/examples/video_salad.mp4"]],
327
+ [["assets/examples/video_tram.mp4"]],
328
+ [["assets/examples/video_tulips.mp4"]]
329
+ ],
330
+ inputs=[input_file],
331
+ label="🎞️ Alternatively, try an Example Video",
332
+ examples_per_page=4
333
+ )
334
+ ref_slider = gr.Slider(4, 32, value=16, step=1, label="Number of Reference Views")
335
+ corr_slider = gr.Slider(5000, 30000, value=20000, step=1000, label="Correspondences per Reference View")
336
+ fit_steps_slider = gr.Slider(100, 5000, value=400, step=100, label="Number of optimization steps")
337
+ preprocess_button = gr.Button("📸 Preprocess Input")
338
+ start_button = gr.Button("🚀 Start Reconstruction", interactive=False)
339
+ gallery = gr.Gallery(label="Selected Reference Views", columns=4, height=300)
340
+
341
+ with gr.Column(scale=3):
342
+ gr.Markdown("### 🏋️ Training Visualization")
343
+ video_output = gr.Video(label="Training Video", autoplay=True)
344
+ render_all_views_button = gr.Button("🎥 Render All-Views Path")
345
+ render_circular_path_button = gr.Button("🎥 Render Circular Path")
346
+ rendered_video_output = gr.Video(label="Rendered Video", autoplay=True)
347
+ with gr.Column(scale=5):
348
+ gr.Markdown("### 🌐 Final 3D Model")
349
+ model3d_viewer = gr.Model3D(label="3D Model Viewer")
350
+
351
+ gr.Markdown("### 📦 Output Files")
352
+ with gr.Row(height=50):
353
+ with gr.Column():
354
+ #gr.Markdown(value=f"[📥 Download .ply](file/point_cloud_final.ply)")
355
+ download_cameras_button = gr.Button("📥 Download Cameras.json")
356
+ download_cameras_file = gr.File(label="📄 Cameras.json")
357
+ with gr.Column():
358
+ download_model_button = gr.Button("📥 Download Pretrained Model (.ply)")
359
+ download_model_file = gr.File(label="📄 Pretrained Model (.ply)")
360
+
361
+ log_output_box = gr.Textbox(label="🖥️ Log", lines=10, interactive=False)
362
+
363
+ def on_preprocess_click(input_file, num_ref_views):
364
+ images, scene_dir = preprocess_input(input_file, num_ref_views)
365
+ return gr.update(value=[x[...,::-1] for x in images]), scene_dir, gr.update(interactive=True)
366
+
367
+ def on_start_click(scene_dir, num_ref_views, num_corrs, num_steps):
368
+ (video_path, ply_path), logs = start_training(scene_dir, num_ref_views, num_corrs, num_steps)
369
+ return video_path, ply_path, logs
370
+
371
+ preprocess_button.click(
372
+ fn=on_preprocess_click,
373
+ inputs=[input_file, ref_slider],
374
+ outputs=[gallery, scene_dir_state, start_button]
375
+ )
376
+
377
+ start_button.click(
378
+ fn=on_start_click,
379
+ inputs=[scene_dir_state, ref_slider, corr_slider, fit_steps_slider],
380
+ outputs=[video_output, model3d_viewer, log_output_box]
381
+ )
382
+
383
+ render_all_views_button.click(fn=render_all_views, inputs=[scene_dir_state], outputs=[rendered_video_output])
384
+ render_circular_path_button.click(fn=render_circular_path, inputs=[scene_dir_state], outputs=[rendered_video_output])
385
+
386
+ download_cameras_button.click(fn=lambda: os.path.join(MODEL_PATH, "cameras.json"), inputs=[], outputs=[download_cameras_file])
387
+ download_model_button.click(fn=lambda: os.path.join(STATIC_FILE_SERVING_FOLDER, "point_cloud_final.ply"), inputs=[], outputs=[download_model_file])
388
+
389
+
390
+ gr.Markdown("""
391
+ ---
392
+ ### <span style='font-size: 20px;'>📖 Detailed Overview</span>
393
+
394
+ If you uploaded a video, it will automatically be reduced to a smaller number of frames (default: 16).
395
+
396
+ The model pipeline:
397
+ 1. 🧠 Runs PyCOLMAP to estimate camera intrinsics & poses (~3–7 seconds for <16 images).
398
+ 2. 🔁 Computes 2D-2D correspondences between views. More correspondences generally improve quality.
399
+ 3. 🔧 Optimizes a 3D Gaussian Splatting model for several steps.
400
+
401
+ ### 🎥 Training Visualization
402
+ You will see a visualization of the entire training process in the "Training Video" pane.
403
+
404
+ ### 🌀 Rendering & 3D Model
405
+ - Render the scene from a circular path of novel views.
406
+ - Or from camera views close to the original input.
407
+
408
+ The 3D model is shown in the right viewer. You can explore it interactively:
409
+ - On PC: WASD keys, arrow keys, and mouse clicks
410
+ - On mobile: pan and pinch to zoom
411
+
412
+ 🕒 Note: the 3D viewer takes a few extra seconds (~5s) to display after training ends.
413
+
414
+ ---
415
+ Preloaded models coming soon. (TODO)
416
+ """, elem_id="details")
417
+
418
+ if __name__ == "__main__":
419
+ parser = argparse.ArgumentParser(description="Launch Gradio demo for EDGS preprocessing and 3D viewing.")
420
+ parser.add_argument("--port", type=int, default=7860, help="Port to launch the Gradio app on.")
421
+ parser.add_argument("--no_share", action='store_true', help="Disable Gradio sharing and assume local access (default: share=True)")
422
+ args = parser.parse_args()
423
+
424
+ demo.launch(server_name="0.0.0.0", server_port=args.port, share=not args.no_share)
install.sh ADDED
@@ -0,0 +1,35 @@
1
+ #!/bin/bash
2
+ git clone [email protected]:CompVis/EDGS.git --recursive
3
+ cd EDGS
4
+ git submodule update --init --recursive
5
+
6
+ conda create -y -n edgs python=3.10 pip
7
+ conda activate edgs
8
+
9
+ # Optionally set path to CUDA
10
+ export CUDA_HOME=/usr/local/cuda-12.1
11
+ export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
12
+ export PATH=$CUDA_HOME/bin:$PATH
13
+ conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia -y
14
+ conda install nvidia/label/cuda-12.1.0::cuda-toolkit -y
15
+
16
+ pip install -e submodules/gaussian-splatting/submodules/diff-gaussian-rasterization
17
+ pip install -e submodules/gaussian-splatting/submodules/simple-knn
18
+
19
+ # For COLMAP and pycolmap
20
+ # Optionally install original colmap but probably pycolmap suffices
21
+ # conda install conda-forge/label/colmap_dev::colmap
22
+ pip install pycolmap
23
+
24
+
25
+ pip install wandb hydra-core tqdm torchmetrics lpips matplotlib rich plyfile imageio imageio-ffmpeg
26
+ conda install numpy=1.26.4 -y -c conda-forge --override-channels
27
+
28
+ pip install -e submodules/RoMa
29
+ conda install anaconda::jupyter --yes
30
+
31
+ # Stuff necessary for gradio and visualizations
32
+ pip install gradio
33
+ pip install plotly scikit-learn moviepy==2.1.1 ffmpeg
34
+ pip install open3d
35
+
main.py ADDED
@@ -0,0 +1,268 @@
1
+ import torch
2
+ import os
3
+ import shutil
4
+ import tempfile
5
+ import uuid
6
+ import asyncio
7
+ import io
8
+ import time
9
+ import contextlib
10
+ import base64
11
+ from PIL import Image
12
+ import numpy as np
13
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Body
14
+ from fastapi.responses import JSONResponse, StreamingResponse, FileResponse
15
+ from pydantic import BaseModel, Field
16
+
17
+ try:
18
+ from source.utils_aux import set_seed
19
+ from source.utils_preprocess import read_video_frames, preprocess_frames, select_optimal_frames, save_frames_to_scene_dir, run_colmap_on_scene
20
+ from source.trainer import EDGSTrainer
21
+ from hydra import initialize, compose
22
+ import hydra
23
+ from source.visualization import generate_fully_smooth_cameras_with_tsp, put_text_on_image
24
+ import sys
25
+ sys.path.append('../submodules/RoMa') # Adjust this path if necessary
26
+ from romatch import roma_indoor
27
+ except ImportError as e:
28
+ print(f"Error: could not import the EDGS project modules. Make sure the paths and the installation are correct. {e}")
29
+ sys.exit(1)
30
+
31
+ # --- Initial Setup ---
32
+ # 1. FastAPI app initialization
33
+ app = FastAPI(
34
+ title="EDGS Training API",
35
+ description="An API to preprocess videos and train 3DGS models with EDGS.",
36
+ version="1.0.0"
37
+ )
38
+
39
+ # 2. Global variables and state storage
40
+ # The model is loaded in the 'startup' event
41
+ roma_model = None
42
+
43
+ # In-memory store used to track task state across endpoints
44
+ tasks_db = {}
45
+
46
+ # 3. Pydantic models for data validation
47
+ class TrainParams(BaseModel):
48
+ num_corrs_per_view: int = Field(20000, gt=0, description="Correspondences per reference view.")
49
+ num_steps: int = Field(1000, gt=0, description="Number of optimization steps.")
50
+
51
+ class PreprocessResponse(BaseModel):
52
+ task_id: str
53
+ message: str
54
+ selected_frames_count: int
55
+ # Optional: you could return the images as base64 if the client needs to display them
56
+ # frames: list[str]
57
+
58
+ # --- Business Logic (adapted from the Gradio script) ---
59
+
60
+ # This function runs in a separate thread so it does not block the server
61
+ def run_preprocessing_sync(input_path: str, num_ref_views: int):
62
+ """
63
+ Runs preprocessing: frame selection and COLMAP execution.
64
+ """
65
+ tmpdirname = tempfile.mkdtemp()
66
+ scene_dir = os.path.join(tmpdirname, "scene")
67
+ os.makedirs(scene_dir, exist_ok=True)
68
+
69
+ # 1. Read the video and select the best frames
70
+ frames = read_video_frames(video_input=input_path, max_size=1024)
71
+ frames_scores = preprocess_frames(frames)
72
+ selected_frames_indices = select_optimal_frames(scores=frames_scores, k=min(num_ref_views, len(frames)))
73
+ selected_frames = [frames[frame_idx] for frame_idx in selected_frames_indices]
74
+
75
+ # 2. Save the frames and run COLMAP
76
+ save_frames_to_scene_dir(frames=selected_frames, scene_dir=scene_dir)
77
+ run_colmap_on_scene(scene_dir)
78
+
79
+ return scene_dir, selected_frames
80
+
81
+ async def training_log_generator(scene_dir: str, num_ref_views: int, params: TrainParams, task_id: str):
82
+ """
83
+ Un generador asíncrono que ejecuta el entrenamiento. Los logs detallados se muestran
84
+ en la terminal del servidor, mientras que el cliente recibe un stream de progreso simple.
85
+ """
86
+ def training_pipeline():
87
+ try:
88
+ # Hydra initialization and configuration stay the same
89
+ with initialize(config_path="./configs", version_base="1.1"):
90
+ cfg = compose(config_name="train")
91
+
92
+ # --- FULL CONFIGURATION ---
93
+ scene_name = os.path.basename(scene_dir)
94
+ model_output_dir = f"./outputs/{scene_name}_trained"
95
+ cfg.wandb.mode = "disabled"
96
+ cfg.gs.dataset.model_path = model_output_dir
97
+ cfg.gs.dataset.source_path = scene_dir
98
+ cfg.gs.dataset.images = "images"
99
+ cfg.train.gs_epochs = 30000
100
+ cfg.gs.opt.opacity_reset_interval = 1_000_000
101
+ cfg.train.reduce_opacity = True
102
+ cfg.train.no_densify = True
103
+ cfg.train.max_lr = True
104
+ cfg.init_wC.use = True
105
+ cfg.init_wC.matches_per_ref = params.num_corrs_per_view
106
+ cfg.init_wC.nns_per_ref = 1
107
+ cfg.init_wC.num_refs = num_ref_views
108
+ cfg.init_wC.add_SfM_init = False
109
+ cfg.init_wC.scaling_factor = 0.00077 * 2.
110
+
111
+ set_seed(cfg.seed)
112
+ os.makedirs(cfg.gs.dataset.model_path, exist_ok=True)
113
+
114
+ device = cfg.device
115
+ generator3dgs = hydra.utils.instantiate(cfg.gs, do_train_test_split=False)
116
+ trainer = EDGSTrainer(GS=generator3dgs, training_config=cfg.gs.opt, device=device, log_wandb=False)
117
+ trainer.saving_iterations = []
118
+ trainer.evaluate_iterations = []
119
+ trainer.timer.start()
120
+
121
+ # Progress message for the client before initialization
122
+ yield "data: Initializing model...\n\n"
123
+ trainer.init_with_corr(cfg.init_wC, roma_model=roma_model)
124
+
125
+ # The main training loop
126
+ for step in range(int(params.num_steps // 10)):
127
+ cfg.train.gs_epochs = 10
128
+ # trainer.train() now prints its detailed logs directly to the terminal
129
+ trainer.train(cfg.train)
130
+
131
+ # --- KEY CHANGE ---
132
+ # Send a simple progress message to the client instead of the captured logs.
133
+ yield f"data: Progress: {step*10+10}/{params.num_steps} steps completed.\n\n"
134
+
135
+ trainer.save_model()
136
+ ply_path = os.path.join(cfg.gs.dataset.model_path, f"point_cloud/iteration_{trainer.gs_step}/point_cloud.ply")
137
+
138
+ tasks_db[task_id]['result_ply_path'] = ply_path
139
+
140
+ final_message = "Training completed. The model is ready to download."
141
+ yield f"data: {final_message}\n\n"
142
+
143
+ except Exception as e:
144
+ yield f"data: ERROR: {repr(e)}\n\n"
145
+
146
+ # The loop that drives the pipeline stays the same
147
+ training_gen = training_pipeline()
148
+ for log_message in training_gen:
149
+ yield log_message
150
+ await asyncio.sleep(0.1)
151
+
152
+ # --- App Lifecycle Events ---
153
+
154
+ @app.on_event("startup")
155
+ async def startup_event():
156
+ """
157
+ Loads the RoMa model when the server starts.
158
+ """
159
+ global roma_model
160
+ print("🚀 Starting FastAPI server...")
161
+ if torch.cuda.is_available():
162
+ device = "cuda:0"
163
+ print("✅ GPU detected. Using CUDA.")
164
+ else:
165
+ device = "cpu"
166
+ print("⚠️ No GPU detected. Using CPU (this can be very slow).")
167
+
168
+ roma_model = roma_indoor(device=device)
169
+ roma_model.upsample_preds = False
170
+ roma_model.symmetric = False
171
+ print("🤖 RoMa model loaded and ready.")
172
+
173
+ # --- API Endpoints ---
174
+
175
+ @app.post("/preprocess", response_model=PreprocessResponse)
176
+ async def preprocess_video(
177
+ num_ref_views: int = Body(16, embed=True, description="Number of reference views to extract from the video."),
178
+ video: UploadFile = File(..., description="Video file to process (.mp4, .mov).")
179
+ ):
180
+ """
181
+ Receives a video, preprocesses it (frame extraction + COLMAP), and prepares it for training.
182
+ """
183
+ if not video.filename.lower().endswith(('.mp4', '.avi', '.mov')):
184
+ raise HTTPException(status_code=400, detail="Unsupported file format. Use .mp4, .avi, or .mov.")
185
+
186
+ # Save the video to a temporary file so the library can process it
187
+ with tempfile.NamedTemporaryFile(delete=False, suffix=video.filename) as tmp_video:
188
+ shutil.copyfileobj(video.file, tmp_video)
189
+ tmp_video_path = tmp_video.name
190
+
191
+ try:
192
+ loop = asyncio.get_running_loop()
193
+ # Run the synchronous, blocking function in an executor so the server is not blocked
194
+ scene_dir, selected_frames = await loop.run_in_executor(
195
+ None, run_preprocessing_sync, tmp_video_path, num_ref_views
196
+ )
197
+
198
+ # Generate a unique ID for this task and store the scene path
199
+ task_id = str(uuid.uuid4())
200
+ tasks_db[task_id] = {
201
+ "scene_dir": scene_dir,
202
+ "num_ref_views": len(selected_frames),
203
+ "result_ply_path": None
204
+ }
205
+
206
+ return JSONResponse(
207
+ status_code=200,
208
+ content={
209
+ "task_id": task_id,
210
+ "message": f"Preprocesamiento completado. Se generó el directorio de la escena. Listo para entrenar.",
211
+ "selected_frames_count": len(selected_frames)
212
+ }
213
+ )
214
+ except Exception as e:
215
+ raise HTTPException(status_code=500, detail=f"Error durante el preprocesamiento: {e}")
216
+ finally:
217
+ os.unlink(tmp_video_path) # Clean up the temporary video file
218
+
219
+
220
+ @app.post("/train/{task_id}")
221
+ async def train_model(task_id: str, params: TrainParams):
222
+ """
223
+ Starts training for a preprocessed task.
224
+ Returns a stream of real-time logs.
225
+ """
226
+ if task_id not in tasks_db:
227
+ raise HTTPException(status_code=404, detail="Task ID no encontrado. Por favor, ejecuta el preprocesamiento primero.")
228
+
229
+ task_info = tasks_db[task_id]
230
+ scene_dir = task_info["scene_dir"]
231
+ num_ref_views = task_info["num_ref_views"]
232
+
233
+ return StreamingResponse(
234
+ training_log_generator(scene_dir, num_ref_views, params, task_id),
235
+ media_type="text/event-stream"
236
+ )
237
+
238
+ @app.get("/download/{task_id}")
239
+ async def download_ply_file(task_id: str):
240
+ """
241
+ Allows downloading the .ply file produced by a completed training run.
242
+ """
243
+ if task_id not in tasks_db:
244
+ raise HTTPException(status_code=404, detail="Task ID no encontrado.")
245
+
246
+ task_info = tasks_db[task_id]
247
+ ply_path = task_info.get("result_ply_path")
248
+
249
+ if not ply_path:
250
+ raise HTTPException(status_code=404, detail="El entrenamiento no ha finalizado o el archivo aún no está disponible.")
251
+
252
+ if not os.path.exists(ply_path):
253
+ raise HTTPException(status_code=500, detail="Error: El archivo del modelo no se encuentra en el servidor.")
254
+
255
+ # Generate a user-friendly file name
256
+ file_name = f"model_{task_id[:8]}.ply"
257
+
258
+ return FileResponse(
259
+ path=ply_path,
260
+ media_type='application/octet-stream',
261
+ filename=file_name
262
+ )
263
+
264
+ if __name__ == "__main__":
265
+ import uvicorn
266
+ # To run: uvicorn main:app --reload
267
+ # The --reload flag is for development. Remove it in production.
268
+ uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=False)
metrics.py ADDED
@@ -0,0 +1,115 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ from pathlib import Path
13
+ import os
14
+ import sys
15
+ from PIL import Image
16
+ import torch
17
+ import torchvision.transforms.functional as tf
18
+ sys.path.append('./submodules/gaussian-splatting/')
19
+ from utils.loss_utils import ssim
20
+ from lpipsPyTorch import lpips as lpips_3dgs
21
+ import json
22
+ from tqdm import tqdm
23
+ from utils.image_utils import psnr
24
+ from argparse import ArgumentParser
25
+
26
+ import lpips
27
+
28
+ def readImages(renders_dir, gt_dir):
29
+ renders = []
30
+ gts = []
31
+ image_names = []
32
+ for fname in os.listdir(renders_dir):
33
+ render = Image.open(renders_dir / fname)
34
+ gt = Image.open(gt_dir / fname)
35
+ renders.append(tf.to_tensor(render).unsqueeze(0)[:, :3, :, :].cuda())
36
+ gts.append(tf.to_tensor(gt).unsqueeze(0)[:, :3, :, :].cuda())
37
+ image_names.append(fname)
38
+ return renders, gts, image_names
39
+
40
+ def evaluate(model_paths):
41
+
42
+ full_dict = {}
43
+ per_view_dict = {}
44
+ full_dict_polytopeonly = {}
45
+ per_view_dict_polytopeonly = {}
46
+ print("")
47
+
48
+ for scene_dir in model_paths:
49
+ #try:
50
+ print("Scene:", scene_dir)
51
+ full_dict[scene_dir] = {}
52
+ per_view_dict[scene_dir] = {}
53
+ full_dict_polytopeonly[scene_dir] = {}
54
+ per_view_dict_polytopeonly[scene_dir] = {}
55
+
56
+ test_dir = Path(scene_dir) / "test"
57
+
58
+ for method in os.listdir(test_dir):
59
+ print("Method:", method)
60
+
61
+ full_dict[scene_dir][method] = {}
62
+ per_view_dict[scene_dir][method] = {}
63
+ full_dict_polytopeonly[scene_dir][method] = {}
64
+ per_view_dict_polytopeonly[scene_dir][method] = {}
65
+
66
+ method_dir = test_dir / method
67
+ gt_dir = method_dir/ "gt"
68
+ renders_dir = method_dir / "renders"
69
+ renders, gts, image_names = readImages(renders_dir, gt_dir)
70
+
71
+ ssims = []
72
+ psnrs = []
73
+ lpipss = []
74
+ lpipss_3dgs = []
75
+ with torch.no_grad():
76
+ for idx in tqdm(range(len(renders)), desc="Metric evaluation progress"):
77
+ ssims.append(ssim(renders[idx], gts[idx]))
78
+ psnrs.append(psnr(renders[idx], gts[idx]))
79
+ lpipss.append(lpips_fn(renders[idx], gts[idx]))
80
+ lpipss_3dgs.append(lpips_3dgs(renders[idx], gts[idx], net_type='vgg'))
81
+ torch.cuda.empty_cache()
82
+
83
+ print(" SSIM : {:>12.7f}".format(torch.tensor(ssims).mean(), ".5"))
84
+ print(" PSNR : {:>12.7f}".format(torch.tensor(psnrs).mean(), ".5"))
85
+ print(" LPIPS: {:>12.7f}".format(torch.tensor(lpipss).mean(), ".5"))
86
+ print(" LPIPS_3dgs: {:>12.7f}".format(torch.tensor(lpipss_3dgs).mean(), ".5"))
87
+ print("")
88
+
89
+ full_dict[scene_dir][method].update({"SSIM": torch.tensor(ssims).mean().item(),
90
+ "PSNR": torch.tensor(psnrs).mean().item(),
91
+ "LPIPS": torch.tensor(lpipss).mean().item(),
92
+ "LPIPS_3dgs": torch.tensor(lpipss_3dgs).mean().item(),
93
+ })
94
+ per_view_dict[scene_dir][method].update({"SSIM": {name: ssim for ssim, name in zip(torch.tensor(ssims).tolist(), image_names)},
95
+ "PSNR": {name: psnr for psnr, name in zip(torch.tensor(psnrs).tolist(), image_names)},
96
+ "LPIPS": {name: lp for lp, name in zip(torch.tensor(lpipss).tolist(), image_names)},
97
+ "LPIPS_3dgs": {name: lp for lp, name in zip(torch.tensor(lpipss_3dgs).tolist(), image_names)},
98
+ })
99
+
100
+ with open(scene_dir + "/results.json", 'w') as fp:
101
+ json.dump(full_dict[scene_dir], fp, indent=True)
102
+ with open(scene_dir + "/per_view.json", 'w') as fp:
103
+ json.dump(per_view_dict[scene_dir], fp, indent=True)
104
+ #except:
105
+ # print("Unable to compute metrics for model", scene_dir)
106
+
107
+ if __name__ == "__main__":
108
+ device = torch.device("cuda:0")
109
+ torch.cuda.set_device(device)
110
+ lpips_fn = lpips.LPIPS(net='vgg').to(device)
111
+ # Set up command line argument parser
112
+ parser = ArgumentParser(description="Training script parameters")
113
+ parser.add_argument('--model_paths', '-m', required=True, nargs="+", type=str, default=[])
114
+ args = parser.parse_args()
115
+ evaluate(args.model_paths)
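For reference, the script is invoked as `python metrics.py -m <model_path> [<model_path> ...]` and writes `results.json` / `per_view.json` into each scene directory. A small sketch of reading the averaged metrics back (the path below is a placeholder):

```python
# Sketch: read back the averaged metrics written by metrics.py for one scene.
import json

scene_dir = "output/scene"  # placeholder; use the same path passed via --model_paths
with open(scene_dir + "/results.json") as fp:
    results = json.load(fp)

for method, scores in results.items():
    print(f"{method}: PSNR={scores['PSNR']:.3f}  SSIM={scores['SSIM']:.4f}  "
          f"LPIPS={scores['LPIPS']:.4f}  LPIPS_3dgs={scores['LPIPS_3dgs']:.4f}")
```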
requirements.txt ADDED
@@ -0,0 +1 @@
1
+ fastapi
script.bash ADDED
File without changes
source/EDGS.code-workspace ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "folders": [
3
+ {
4
+ "path": ".."
5
+ },
6
+ {
7
+ "path": "../../../../.."
8
+ }
9
+ ],
10
+ "settings": {}
11
+ }
source/__init__.py ADDED
File without changes
source/corr_init.py ADDED
@@ -0,0 +1,907 @@
1
+ import sys
2
+ sys.path.append('../')
3
+ sys.path.append("../submodules")
4
+ sys.path.append('../submodules/RoMa')
5
+
6
+ from matplotlib import pyplot as plt
7
+ from PIL import Image
8
+ import torch
9
+ import numpy as np
10
+
11
+ #from tqdm import tqdm_notebook as tqdm
12
+ from tqdm import tqdm
13
+ from scipy.cluster.vq import kmeans, vq
14
+ from scipy.spatial.distance import cdist
15
+
16
+ import torch.nn.functional as F
17
+ from romatch import roma_outdoor, roma_indoor
18
+ from utils.sh_utils import RGB2SH
19
+ from romatch.utils import get_tuple_transform_ops
20
+
21
+ import time
22
+ from collections import defaultdict
23
+ from tqdm import tqdm
24
+
25
+
26
+ def pairwise_distances(matrix):
27
+ """
28
+ Computes the pairwise Euclidean distances between all vectors in the input matrix.
29
+
30
+ Args:
31
+ matrix (torch.Tensor): Input matrix of shape [N, D], where N is the number of vectors and D is the dimensionality.
32
+
33
+ Returns:
34
+ torch.Tensor: Pairwise distance matrix of shape [N, N].
35
+ """
36
+ # Compute pairwise Euclidean distances (torch.cdist with p=2 returns distances, not squared ones)
37
+ distances = torch.cdist(matrix, matrix, p=2)
38
+ return distances
39
+
40
+
41
+ def k_closest_vectors(matrix, k):
42
+ """
43
+ Finds the k-closest vectors for each vector in the input matrix based on Euclidean distance.
44
+
45
+ Args:
46
+ matrix (torch.Tensor): Input matrix of shape [N, D], where N is the number of vectors and D is the dimensionality.
47
+ k (int): Number of closest vectors to return for each vector.
48
+
49
+ Returns:
50
+ torch.Tensor: Indices of the k-closest vectors for each vector, excluding the vector itself.
51
+ """
52
+ # Compute pairwise distances
53
+ distances = pairwise_distances(matrix)
54
+
55
+ # For each vector, sort distances and get the indices of the k-closest vectors (excluding itself)
56
+ # Set diagonal distances to infinity to exclude the vector itself from the nearest neighbors
57
+ distances.fill_diagonal_(float('inf'))
58
+
59
+ # Get the indices of the k smallest distances (k-closest vectors)
60
+ _, indices = torch.topk(distances, k, largest=False, dim=1)
61
+
62
+ return indices
63
+
64
+
65
+ def select_cameras_kmeans(cameras, K):
66
+ """
67
+ Selects K cameras from a set using K-means clustering.
68
+
69
+ Args:
70
+ cameras: NumPy array of shape (N, 16), representing N cameras with their 4x4 homogeneous matrices flattened.
71
+ K: Number of clusters (cameras to select).
72
+
73
+ Returns:
74
+ selected_indices: List of indices of the cameras closest to the cluster centers.
75
+ """
76
+ # Ensure input is a NumPy array
77
+ if not isinstance(cameras, np.ndarray):
78
+ cameras = np.asarray(cameras)
79
+
80
+ if cameras.shape[1] != 16:
81
+ raise ValueError("Each camera must have 16 values corresponding to a flattened 4x4 matrix.")
82
+
83
+ # Perform K-means clustering
84
+ cluster_centers, _ = kmeans(cameras, K)
85
+
86
+ # Assign each camera to a cluster and find distances to cluster centers
87
+ cluster_assignments, _ = vq(cameras, cluster_centers)
88
+
89
+ # Find the camera nearest to each cluster center
90
+ selected_indices = []
91
+ for k in range(K):
92
+ cluster_members = cameras[cluster_assignments == k]
93
+ distances = cdist([cluster_centers[k]], cluster_members)[0]
94
+ nearest_camera_idx = np.where(cluster_assignments == k)[0][np.argmin(distances)]
95
+ selected_indices.append(nearest_camera_idx)
96
+
97
+ return selected_indices
98
+
99
+
100
+ def compute_warp_and_confidence(viewpoint_cam1, viewpoint_cam2, roma_model, device="cuda", verbose=False, output_dict={}):
101
+ """
102
+ Computes the warp and confidence between two viewpoint cameras using the roma_model.
103
+
104
+ Args:
105
+ viewpoint_cam1: Source viewpoint camera.
106
+ viewpoint_cam2: Target viewpoint camera.
107
+ roma_model: Pre-trained Roma model for correspondence matching.
108
+ device: Device to run the computation on.
109
+ verbose: If True, displays the images.
110
+
111
+ Returns:
112
+ certainty: Confidence tensor.
113
+ warp: Warp tensor.
114
+ imB: Processed image B as numpy array.
115
+ """
116
+ # Prepare images
117
+ imA = viewpoint_cam1.original_image.detach().cpu().numpy().transpose(1, 2, 0)
118
+ imB = viewpoint_cam2.original_image.detach().cpu().numpy().transpose(1, 2, 0)
119
+ imA = Image.fromarray(np.clip(imA * 255, 0, 255).astype(np.uint8))
120
+ imB = Image.fromarray(np.clip(imB * 255, 0, 255).astype(np.uint8))
121
+
122
+ if verbose:
123
+ fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
124
+ cax1 = ax[0].imshow(imA)
125
+ ax[0].set_title("Image 1")
126
+ cax2 = ax[1].imshow(imB)
127
+ ax[1].set_title("Image 2")
128
+ fig.colorbar(cax1, ax=ax[0])
129
+ fig.colorbar(cax2, ax=ax[1])
130
+
131
+ for axis in ax:
132
+ axis.axis('off')
133
+ # Save the figure into the dictionary
134
+ output_dict[f'image_pair'] = fig
135
+
136
+ # Transform images
137
+ ws, hs = roma_model.w_resized, roma_model.h_resized
138
+ test_transform = get_tuple_transform_ops(resize=(hs, ws), normalize=True)
139
+ im_A, im_B = test_transform((imA, imB))
140
+ batch = {"im_A": im_A[None].to(device), "im_B": im_B[None].to(device)}
141
+
142
+ # Forward pass through Roma model
143
+ corresps = roma_model.forward(batch) if not roma_model.symmetric else roma_model.forward_symmetric(batch)
144
+ finest_scale = 1
145
+ hs, ws = roma_model.upsample_res if roma_model.upsample_preds else (hs, ws)
146
+
147
+ # Process certainty and warp
148
+ certainty = corresps[finest_scale]["certainty"]
149
+ im_A_to_im_B = corresps[finest_scale]["flow"]
150
+ if roma_model.attenuate_cert:
151
+ low_res_certainty = F.interpolate(
152
+ corresps[16]["certainty"], size=(hs, ws), align_corners=False, mode="bilinear"
153
+ )
154
+ certainty -= 0.5 * low_res_certainty * (low_res_certainty < 0)
155
+
156
+ # Upsample predictions if needed
157
+ if roma_model.upsample_preds:
158
+ im_A_to_im_B = F.interpolate(
159
+ im_A_to_im_B, size=(hs, ws), align_corners=False, mode="bilinear"
160
+ )
161
+ certainty = F.interpolate(
162
+ certainty, size=(hs, ws), align_corners=False, mode="bilinear"
163
+ )
164
+
165
+ # Convert predictions to final format
166
+ im_A_to_im_B = im_A_to_im_B.permute(0, 2, 3, 1)
167
+ im_A_coords = torch.stack(torch.meshgrid(
168
+ torch.linspace(-1 + 1 / hs, 1 - 1 / hs, hs, device=device),
169
+ torch.linspace(-1 + 1 / ws, 1 - 1 / ws, ws, device=device),
170
+ indexing='ij'
171
+ ), dim=0).permute(1, 2, 0).unsqueeze(0).expand(im_A_to_im_B.size(0), -1, -1, -1)
172
+
173
+ warp = torch.cat((im_A_coords, im_A_to_im_B), dim=-1)
174
+ certainty = certainty.sigmoid()
175
+
176
+ return certainty[0, 0], warp[0], np.array(imB)
177
+
178
+
179
+ def resize_batch(tensors_3d, tensors_4d, target_shape):
180
+ """
181
+ Resizes a batch of tensors with shapes [B, H, W] and [B, H, W, 4] to the target spatial dimensions.
182
+
183
+ Args:
184
+ tensors_3d: Tensor of shape [B, H, W].
185
+ tensors_4d: Tensor of shape [B, H, W, 4].
186
+ target_shape: Tuple (target_H, target_W) specifying the target spatial dimensions.
187
+
188
+ Returns:
189
+ resized_tensors_3d: Tensor of shape [B, target_H, target_W].
190
+ resized_tensors_4d: Tensor of shape [B, target_H, target_W, 4].
191
+ """
192
+ target_H, target_W = target_shape
193
+
194
+ # Resize [B, H, W] tensor
195
+ resized_tensors_3d = F.interpolate(
196
+ tensors_3d.unsqueeze(1), size=(target_H, target_W), mode="bilinear", align_corners=False
197
+ ).squeeze(1)
198
+
199
+ # Resize [B, H, W, 4] tensor
200
+ B, _, _, C = tensors_4d.shape
201
+ resized_tensors_4d = F.interpolate(
202
+ tensors_4d.permute(0, 3, 1, 2), size=(target_H, target_W), mode="bilinear", align_corners=False
203
+ ).permute(0, 2, 3, 1)
204
+
205
+ return resized_tensors_3d, resized_tensors_4d
206
+
207
+
208
+ def aggregate_confidences_and_warps(viewpoint_stack, closest_indices, roma_model, source_idx, verbose=False, output_dict={}):
209
+ """
210
+ Aggregates confidences and warps by iterating over the nearest neighbors of the source viewpoint.
211
+
212
+ Args:
213
+ viewpoint_stack: Stack of viewpoint cameras.
214
+ closest_indices: Indices of the nearest neighbors for each viewpoint.
215
+ roma_model: Pre-trained Roma model.
216
+ source_idx: Index of the source viewpoint.
217
+ verbose: If True, displays intermediate results.
218
+
219
+ Returns:
220
+ certainties_max: Aggregated maximum confidences.
221
+ warps_max: Aggregated warps corresponding to maximum confidences.
222
+ certainties_max_idcs: Pixel-wise index of the image from which we take the best match.
223
+ imB_compound: List of the neighboring images.
224
+ """
225
+ certainties_all, warps_all, imB_compound = [], [], []
226
+
227
+ for nn in tqdm(closest_indices[source_idx]):
228
+
229
+ viewpoint_cam1 = viewpoint_stack[source_idx]
230
+ viewpoint_cam2 = viewpoint_stack[nn]
231
+
232
+ certainty, warp, imB = compute_warp_and_confidence(viewpoint_cam1, viewpoint_cam2, roma_model, verbose=verbose, output_dict=output_dict)
233
+ certainties_all.append(certainty)
234
+ warps_all.append(warp)
235
+ imB_compound.append(imB)
236
+
237
+ certainties_all = torch.stack(certainties_all, dim=0)
238
+ target_shape = imB_compound[0].shape[:2]
239
+ if verbose:
240
+ print("certainties_all.shape:", certainties_all.shape)
241
+ print("torch.stack(warps_all, dim=0).shape:", torch.stack(warps_all, dim=0).shape)
242
+ print("target_shape:", target_shape)
243
+
244
+ certainties_all_resized, warps_all_resized = resize_batch(certainties_all,
245
+ torch.stack(warps_all, dim=0),
246
+ target_shape
247
+ )
248
+
249
+ if verbose:
250
+ print("warps_all_resized.shape:", warps_all_resized.shape)
251
+ for n, cert in enumerate(certainties_all):
252
+ fig, ax = plt.subplots()
253
+ cax = ax.imshow(cert.cpu().numpy(), cmap='viridis')
254
+ fig.colorbar(cax, ax=ax)
255
+ ax.set_title("Pixel-wise Confidence")
256
+ output_dict[f'certainty_{n}'] = fig
257
+
258
+ for n, warp in enumerate(warps_all):
259
+ fig, ax = plt.subplots()
260
+ cax = ax.imshow(warp.cpu().numpy()[:, :, :3], cmap='viridis')
261
+ fig.colorbar(cax, ax=ax)
262
+ ax.set_title("Pixel-wise warp")
263
+ output_dict[f'warp_resized_{n}'] = fig
264
+
265
+ for n, cert in enumerate(certainties_all_resized):
266
+ fig, ax = plt.subplots()
267
+ cax = ax.imshow(cert.cpu().numpy(), cmap='viridis')
268
+ fig.colorbar(cax, ax=ax)
269
+ ax.set_title("Pixel-wise Confidence resized")
270
+ output_dict[f'certainty_resized_{n}'] = fig
271
+
272
+ for n, warp in enumerate(warps_all_resized):
273
+ fig, ax = plt.subplots()
274
+ cax = ax.imshow(warp.cpu().numpy()[:, :, :3], cmap='viridis')
275
+ fig.colorbar(cax, ax=ax)
276
+ ax.set_title("Pixel-wise warp resized")
277
+ output_dict[f'warp_resized_{n}'] = fig
278
+
279
+ certainties_max, certainties_max_idcs = torch.max(certainties_all_resized, dim=0)
280
+ H, W = certainties_max.shape
281
+
282
+ warps_max = warps_all_resized[certainties_max_idcs, torch.arange(H).unsqueeze(1), torch.arange(W)]
283
+
284
+ imA = viewpoint_cam1.original_image.detach().cpu().numpy().transpose(1, 2, 0)
285
+ imA = np.clip(imA * 255, 0, 255).astype(np.uint8)
286
+
287
+ return certainties_max, warps_max, certainties_max_idcs, imA, imB_compound, certainties_all_resized, warps_all_resized
288
+
289
+
290
+
291
+ def extract_keypoints_and_colors(imA, imB_compound, certainties_max, certainties_max_idcs, matches, roma_model,
292
+ verbose=False, output_dict={}):
293
+ """
294
+ Extracts keypoints and corresponding colors from the source image (imA) and multiple target images (imB_compound).
295
+
296
+ Args:
297
+ imA: Source image as a NumPy array (H_A, W_A, C).
298
+ imB_compound: List of target images as NumPy arrays [(H_B, W_B, C), ...].
299
+ certainties_max: Tensor of pixel-wise maximum confidences.
300
+ certainties_max_idcs: Tensor of pixel-wise indices for the best matches.
301
+ matches: Matches in normalized coordinates.
302
+ roma_model: Roma model instance for keypoint operations.
303
+ verbose: whether to show intermediate outputs and visualize results
304
+
305
+ Returns:
306
+ kptsA_np: Keypoints in imA in normalized coordinates.
307
+ kptsB_np: Keypoints in imB in normalized coordinates.
308
+ kptsA_color: Colors of keypoints in imA.
309
+ kptsB_color: Colors of keypoints in imB based on certainties_max_idcs.
310
+ """
311
+ H_A, W_A, _ = imA.shape
312
+ H, W = certainties_max.shape
313
+
314
+ # Convert matches to pixel coordinates
315
+ kptsA, kptsB = roma_model.to_pixel_coordinates(
316
+ matches, W_A, H_A, H, W # W, H
317
+ )
318
+
319
+ kptsA_np = kptsA.detach().cpu().numpy()
320
+ kptsB_np = kptsB.detach().cpu().numpy()
321
+ kptsA_np = kptsA_np[:, [1, 0]]
322
+
323
+ if verbose:
324
+ fig, ax = plt.subplots(figsize=(12, 6))
325
+ cax = ax.imshow(imA)
326
+ ax.set_title("Reference image, imA")
327
+ output_dict[f'reference_image'] = fig
328
+
329
+ fig, ax = plt.subplots(figsize=(12, 6))
330
+ cax = ax.imshow(imB_compound[0])
331
+ ax.set_title("Image to compare to image, imB_compound")
332
+ output_dict[f'imB_compound'] = fig
333
+
334
+ fig, ax = plt.subplots(figsize=(12, 6))
335
+ cax = ax.imshow(np.flipud(imA))
336
+ cax = ax.scatter(kptsA_np[:, 0], H_A - kptsA_np[:, 1], s=.03)
337
+ ax.set_title("Keypoints in imA")
338
+ ax.set_xlim(0, W_A)
339
+ ax.set_ylim(0, H_A)
340
+ output_dict[f'kptsA'] = fig
341
+
342
+ fig, ax = plt.subplots(figsize=(12, 6))
343
+ cax = ax.imshow(np.flipud(imB_compound[0]))
344
+ cax = ax.scatter(kptsB_np[:, 0], H_A - kptsB_np[:, 1], s=.03)
345
+ ax.set_title("Keypoints in imB")
346
+ ax.set_xlim(0, W_A)
347
+ ax.set_ylim(0, H_A)
348
+ output_dict[f'kptsB'] = fig
349
+
350
+ # Keypoints are in (row, column) format, so the first value is always in range [0, height] and the second in range [0, width]
351
+
352
+ kptsA_np = kptsA.detach().cpu().numpy()
353
+ kptsB_np = kptsB.detach().cpu().numpy()
354
+
355
+ # Extract colors for keypoints in imA (vectorized)
356
+ # New experimental version
357
+ kptsA_x = np.round(kptsA_np[:, 0] / 1.).astype(int)
358
+ kptsA_y = np.round(kptsA_np[:, 1] / 1.).astype(int)
359
+ kptsA_color = imA[np.clip(kptsA_x, 0, H - 1), np.clip(kptsA_y, 0, W - 1)]
360
+
361
+ # Create a composite image from imB_compound
362
+ imB_compound_np = np.stack(imB_compound, axis=0)
363
+ H_B, W_B, _ = imB_compound[0].shape
364
+
365
+ # Extract colors for keypoints in imB using certainties_max_idcs
366
+ imB_np = imB_compound_np[
367
+ certainties_max_idcs.detach().cpu().numpy(),
368
+ np.arange(H).reshape(-1, 1),
369
+ np.arange(W)
370
+ ]
371
+
372
+ if verbose:
373
+ print("imB_np.shape:", imB_np.shape)
374
+ print("imB_np:", imB_np)
375
+ fig, ax = plt.subplots(figsize=(12, 6))
376
+ cax = ax.imshow(np.flipud(imB_np))
377
+ cax = ax.scatter(kptsB_np[:, 0], H_A - kptsB_np[:, 1], s=.03)
378
+ ax.set_title("np.flipud(imB_np[0]")
379
+ ax.set_xlim(0, W_A)
380
+ ax.set_ylim(0, H_A)
381
+ output_dict[f'np.flipud(imB_np[0]'] = fig
382
+
383
+
384
+ kptsB_x = np.round(kptsB_np[:, 0]).astype(int)
385
+ kptsB_y = np.round(kptsB_np[:, 1]).astype(int)
386
+
387
+ certainties_max_idcs_np = certainties_max_idcs.detach().cpu().numpy()
388
+ kptsB_proj_matrices_idx = certainties_max_idcs_np[np.clip(kptsA_x, 0, H - 1), np.clip(kptsA_y, 0, W - 1)]
389
+ kptsB_color = imB_compound_np[kptsB_proj_matrices_idx, np.clip(kptsB_y, 0, H - 1), np.clip(kptsB_x, 0, W - 1)]
390
+
391
+ # Normalize keypoints in both images
392
+ kptsA_np[:, 0] = kptsA_np[:, 0] / H * 2.0 - 1.0
393
+ kptsA_np[:, 1] = kptsA_np[:, 1] / W * 2.0 - 1.0
394
+ kptsB_np[:, 0] = kptsB_np[:, 0] / W_B * 2.0 - 1.0
395
+ kptsB_np[:, 1] = kptsB_np[:, 1] / H_B * 2.0 - 1.0
396
+
397
+ return kptsA_np[:, [1, 0]], kptsB_np, kptsB_proj_matrices_idx, kptsA_color, kptsB_color
398
+
399
+ def prepare_tensor(input_array, device):
400
+ """
401
+ Converts an input array to a torch tensor, clones it, and detaches it for safe computation.
402
+ Args:
403
+ input_array (array-like): The input array to convert.
404
+ device (str or torch.device): The device to move the tensor to.
405
+ Returns:
406
+ torch.Tensor: A detached tensor clone of the input array on the specified device.
407
+ """
408
+ if not isinstance(input_array, torch.Tensor):
409
+ return torch.tensor(input_array, dtype=torch.float32).to(device).clone().detach()
410
+ return input_array.clone().detach().to(device).to(torch.float32)
411
+
412
+ def triangulate_points(P1, P2, k1_x, k1_y, k2_x, k2_y, device="cuda"):
413
+ """
414
+ Solves for a batch of 3D points given batches of projection matrices and corresponding image points.
415
+
416
+ Parameters:
417
+ - P1, P2: Tensors of projection matrices of size (batch_size, 4, 4) or (4, 4)
418
+ - k1_x, k1_y: Tensors of shape (batch_size,)
419
+ - k2_x, k2_y: Tensors of shape (batch_size,)
420
+
421
+ Returns:
422
+ - X: A tensor containing the 3D homogeneous coordinates, shape (batch_size, 4)
423
+ """
424
+ EPS = 1e-4
425
+ # Ensure inputs are tensors
426
+
427
+ P1 = prepare_tensor(P1, device)
428
+ P2 = prepare_tensor(P2, device)
429
+ k1_x = prepare_tensor(k1_x, device)
430
+ k1_y = prepare_tensor(k1_y, device)
431
+ k2_x = prepare_tensor(k2_x, device)
432
+ k2_y = prepare_tensor(k2_y, device)
433
+ batch_size = k1_x.shape[0]
434
+
435
+ # Expand P1 and P2 if they are not batched
436
+ if P1.ndim == 2:
437
+ P1 = P1.unsqueeze(0).expand(batch_size, -1, -1)
438
+ if P2.ndim == 2:
439
+ P2 = P2.unsqueeze(0).expand(batch_size, -1, -1)
440
+
441
+ # Extract columns from P1 and P2
442
+ P1_0 = P1[:, :, 0] # Shape: (batch_size, 4)
443
+ P1_1 = P1[:, :, 1]
444
+ P1_2 = P1[:, :, 2]
445
+
446
+ P2_0 = P2[:, :, 0]
447
+ P2_1 = P2[:, :, 1]
448
+ P2_2 = P2[:, :, 2]
449
+
450
+ # Reshape kx and ky to (batch_size, 1)
451
+ k1_x = k1_x.view(-1, 1)
452
+ k1_y = k1_y.view(-1, 1)
453
+ k2_x = k2_x.view(-1, 1)
454
+ k2_y = k2_y.view(-1, 1)
455
+
456
+ # Construct the equations for each batch
457
+ # For camera 1
458
+ A1 = P1_0 - k1_x * P1_2 # Shape: (batch_size, 4)
459
+ A2 = P1_1 - k1_y * P1_2
460
+ # For camera 2
461
+ A3 = P2_0 - k2_x * P2_2
462
+ A4 = P2_1 - k2_y * P2_2
463
+
464
+ # Stack the equations
465
+ A = torch.stack([A1, A2, A3, A4], dim=1) # Shape: (batch_size, 4, 4)
466
+
467
+ # Right-hand side (constants)
468
+ b = -A[:, :, 3] # Shape: (batch_size, 4)
469
+ A_reduced = A[:, :, :3] # Coefficients of x, y, z
470
+
471
+ # Solve using torch.linalg.lstsq (supports batching)
472
+ X_xyz = torch.linalg.lstsq(A_reduced, b.unsqueeze(2)).solution.squeeze(2) # Shape: (batch_size, 3)
473
+
474
+ # Append 1 to get homogeneous coordinates
475
+ ones = torch.ones((batch_size, 1), dtype=torch.float32, device=X_xyz.device)
476
+ X = torch.cat([X_xyz, ones], dim=1) # Shape: (batch_size, 4)
477
+
478
+ # Now compute the errors of projections.
479
+ seeked_splats_proj1 = (X.unsqueeze(1) @ P1).squeeze(1)
480
+ seeked_splats_proj1 = seeked_splats_proj1 / (EPS + seeked_splats_proj1[:, [3]])
481
+ seeked_splats_proj2 = (X.unsqueeze(1) @ P2).squeeze(1)
482
+ seeked_splats_proj2 = seeked_splats_proj2 / (EPS + seeked_splats_proj2[:, [3]])
483
+ proj1_target = torch.concat([k1_x, k1_y], dim=1)
484
+ proj2_target = torch.concat([k2_x, k2_y], dim=1)
485
+ errors_proj1 = torch.abs(seeked_splats_proj1[:, :2] - proj1_target).sum(1).detach().cpu().numpy()
486
+ errors_proj2 = torch.abs(seeked_splats_proj2[:, :2] - proj2_target).sum(1).detach().cpu().numpy()
487
+
488
+ return X, errors_proj1, errors_proj2
489
+
490
+
491
+
492
+ def select_best_keypoints(
493
+ NNs_triangulated_points, NNs_errors_proj1, NNs_errors_proj2, device="cuda"):
494
+ """
495
+ From all candidate points triangulated for each keypoint (one per neighboring view), selects the point with the lowest reprojection error.
496
+
497
+ Args:
498
+ NNs_triangulated_points: torch tensor with keypoints coordinates (num_nns, num_points, dim). dim can be arbitrary,
499
+ usually 3 or 4(for homogeneous representation).
500
+ NNs_errors_proj1: numpy array with projection error of the estimated keypoint on the reference frame (num_nns, num_points).
501
+ NNs_errors_proj2: numpy array with projection error of the estimated keypoint on the neighbor frame (num_nns, num_points).
502
+ Returns:
503
+ selected_keypoints: keypoints with the best score.
504
+ """
505
+
506
+ NNs_errors_proj = np.maximum(NNs_errors_proj1, NNs_errors_proj2)
507
+
508
+ # Convert indices to PyTorch tensor
509
+ indices = torch.from_numpy(np.argmin(NNs_errors_proj, axis=0)).long().to(device)
510
+
511
+ # Create index tensor for the second dimension
512
+ n_indices = torch.arange(NNs_triangulated_points.shape[1]).long().to(device)
513
+
514
+ # Use advanced indexing to select elements
515
+ NNs_triangulated_points_selected = NNs_triangulated_points[indices, n_indices, :] # Shape: [N, k]
516
+
517
+ return NNs_triangulated_points_selected, np.min(NNs_errors_proj, axis=0)
518
+
519
+
520
+
521
+ def init_gaussians_with_corr(gaussians, scene, cfg, device, verbose = False, roma_model=None):
522
+ """
523
+ For given input Gaussians and a scene, we instantiate a RoMa model (switch to the indoor model if necessary) and process the scene's
524
+ training frames to extract correspondences. Those are used to initialize the Gaussians.
525
+ Args:
526
+ gaussians: object gaussians of the class GaussianModel that we need to enrich with gaussians.
527
+ scene: object of the Scene class.
528
+ cfg: configuration. Use init_wC
529
+ Returns:
530
+ gaussians: inplace transforms object gaussians of the class GaussianModel.
531
+
532
+ """
533
+ if roma_model is None:
534
+ if cfg.roma_model == "indoors":
535
+ roma_model = roma_indoor(device=device)
536
+ else:
537
+ roma_model = roma_outdoor(device=device)
538
+ roma_model.upsample_preds = False
539
+ roma_model.symmetric = False
540
+ M = cfg.matches_per_ref
541
+ upper_thresh = roma_model.sample_thresh
542
+ scaling_factor = cfg.scaling_factor
543
+ expansion_factor = 1
544
+ keypoint_fit_error_tolerance = cfg.proj_err_tolerance
545
+ visualizations = {}
546
+ viewpoint_stack = scene.getTrainCameras().copy()
547
+ NUM_REFERENCE_FRAMES = min(cfg.num_refs, len(viewpoint_stack))
548
+ NUM_NNS_PER_REFERENCE = min(cfg.nns_per_ref , len(viewpoint_stack))
549
+ # Select cameras using K-means
550
+ viewpoint_cam_all = torch.stack([x.world_view_transform.flatten() for x in viewpoint_stack], axis=0)
551
+
552
+ selected_indices = select_cameras_kmeans(cameras=viewpoint_cam_all.detach().cpu().numpy(), K=NUM_REFERENCE_FRAMES)
553
+ selected_indices = sorted(selected_indices)
554
+
555
+
556
+ # Find the k-closest vectors for each vector
557
+ viewpoint_cam_all = torch.stack([x.world_view_transform.flatten() for x in viewpoint_stack], axis=0)
558
+ closest_indices = k_closest_vectors(viewpoint_cam_all, NUM_NNS_PER_REFERENCE)
559
+ if verbose: print("Indices of k-closest vectors for each vector:\n", closest_indices)
560
+
561
+ closest_indices_selected = closest_indices[:, :].detach().cpu().numpy()
562
+
563
+ all_new_xyz = []
564
+ all_new_features_dc = []
565
+ all_new_features_rest = []
566
+ all_new_opacities = []
567
+ all_new_scaling = []
568
+ all_new_rotation = []
569
+
570
+ # Run roma_model.match once to warm up and initialize the model
571
+ with torch.no_grad():
572
+ viewpoint_cam1 = viewpoint_stack[0]
573
+ viewpoint_cam2 = viewpoint_stack[1]
574
+ imA = viewpoint_cam1.original_image.detach().cpu().numpy().transpose(1, 2, 0)
575
+ imB = viewpoint_cam2.original_image.detach().cpu().numpy().transpose(1, 2, 0)
576
+ imA = Image.fromarray(np.clip(imA * 255, 0, 255).astype(np.uint8))
577
+ imB = Image.fromarray(np.clip(imB * 255, 0, 255).astype(np.uint8))
578
+ warp, certainty_warp = roma_model.match(imA, imB, device=device)
579
+ print("Once run full roma_model.match warp.shape:", warp.shape)
580
+ print("Once run full roma_model.match certainty_warp.shape:", certainty_warp.shape)
581
+ del warp, certainty_warp
582
+ torch.cuda.empty_cache()
583
+
584
+ for source_idx in tqdm(sorted(selected_indices)):
585
+ # 1. Compute keypoints and warping for all the neigboring views
586
+ with torch.no_grad():
587
+ # Call the aggregation function to get imA and imB_compound
588
+ certainties_max, warps_max, certainties_max_idcs, imA, imB_compound, certainties_all, warps_all = aggregate_confidences_and_warps(
589
+ viewpoint_stack=viewpoint_stack,
590
+ closest_indices=closest_indices_selected,
591
+ roma_model=roma_model,
592
+ source_idx=source_idx,
593
+ verbose=verbose, output_dict=visualizations
594
+ )
595
+
596
+
597
+ # Triangulate keypoints
598
+ with torch.no_grad():
599
+ matches = warps_max
600
+ certainty = certainties_max
601
+ certainty = certainty.clone()
602
+ certainty[certainty > upper_thresh] = 1
603
+ matches, certainty = (
604
+ matches.reshape(-1, 4),
605
+ certainty.reshape(-1),
606
+ )
607
+
608
+ # Select based on certainty elements with high confidence. These are basically all of
609
+ # kptsA_np.
610
+ good_samples = torch.multinomial(certainty,
611
+ num_samples=min(expansion_factor * M, len(certainty)),
612
+ replacement=False)
613
+
614
+ certainties_max, warps_max, certainties_max_idcs, imA, imB_compound, certainties_all, warps_all
615
+ reference_image_dict = {
616
+ "ref_image": imA,
617
+ "NNs_images": imB_compound,
618
+ "certainties_all": certainties_all,
619
+ "warps_all": warps_all,
620
+ "triangulated_points": [],
621
+ "triangulated_points_errors_proj1": [],
622
+ "triangulated_points_errors_proj2": []
623
+
624
+ }
625
+ with torch.no_grad():
626
+ for NN_idx in tqdm(range(len(warps_all))):
627
+ matches_NN = warps_all[NN_idx].reshape(-1, 4)[good_samples]
628
+
629
+ # Extract keypoints and colors
630
+ kptsA_np, kptsB_np, kptsB_proj_matrices_idcs, kptsA_color, kptsB_color = extract_keypoints_and_colors(
631
+ imA, imB_compound, certainties_max, certainties_max_idcs, matches_NN, roma_model
632
+ )
633
+
634
+ proj_matrices_A = viewpoint_stack[source_idx].full_proj_transform
635
+ proj_matrices_B = viewpoint_stack[closest_indices_selected[source_idx, NN_idx]].full_proj_transform
636
+ triangulated_points, triangulated_points_errors_proj1, triangulated_points_errors_proj2 = triangulate_points(
637
+ P1=torch.stack([proj_matrices_A] * M, axis=0),
638
+ P2=torch.stack([proj_matrices_B] * M, axis=0),
639
+ k1_x=kptsA_np[:M, 0], k1_y=kptsA_np[:M, 1],
640
+ k2_x=kptsB_np[:M, 0], k2_y=kptsB_np[:M, 1])
641
+
642
+ reference_image_dict["triangulated_points"].append(triangulated_points)
643
+ reference_image_dict["triangulated_points_errors_proj1"].append(triangulated_points_errors_proj1)
644
+ reference_image_dict["triangulated_points_errors_proj2"].append(triangulated_points_errors_proj2)
645
+
646
+ with torch.no_grad():
647
+ NNs_triangulated_points_selected, NNs_triangulated_points_selected_proj_errors = select_best_keypoints(
648
+ NNs_triangulated_points=torch.stack(reference_image_dict["triangulated_points"], dim=0),
649
+ NNs_errors_proj1=np.stack(reference_image_dict["triangulated_points_errors_proj1"], axis=0),
650
+ NNs_errors_proj2=np.stack(reference_image_dict["triangulated_points_errors_proj2"], axis=0))
651
+
652
+ # 4. Save as gaussians
653
+ viewpoint_cam1 = viewpoint_stack[source_idx]
654
+ N = len(NNs_triangulated_points_selected)
655
+ with torch.no_grad():
656
+ new_xyz = NNs_triangulated_points_selected[:, :-1]
657
+ all_new_xyz.append(new_xyz) # seeked_splats
658
+ all_new_features_dc.append(RGB2SH(torch.tensor(kptsA_color.astype(np.float32) / 255.)).unsqueeze(1))
659
+ all_new_features_rest.append(torch.stack([gaussians._features_rest[-1].clone().detach() * 0.] * N, dim=0))
660
+ # new version that sets points with large error invisible
661
+ # TODO: remove those points instead. However it doesn't affect the performance.
662
+ mask_bad_points = torch.tensor(
663
+ NNs_triangulated_points_selected_proj_errors > keypoint_fit_error_tolerance,
664
+ dtype=torch.float32).unsqueeze(1).to(device)
665
+ all_new_opacities.append(torch.stack([gaussians._opacity[-1].clone().detach()] * N, dim=0) * 0. - mask_bad_points * (1e1))
666
+
667
+ dist_points_to_cam1 = torch.linalg.norm(viewpoint_cam1.camera_center.clone().detach() - new_xyz,
668
+ dim=1, ord=2)
669
+ #all_new_scaling.append(torch.log(((dist_points_to_cam1) / 1. * scaling_factor).unsqueeze(1).repeat(1, 3)))
670
+ all_new_scaling.append(gaussians.scaling_inverse_activation((dist_points_to_cam1 * scaling_factor).unsqueeze(1).repeat(1, 3)))
671
+ all_new_rotation.append(torch.stack([gaussians._rotation[-1].clone().detach()] * N, dim=0))
672
+
673
+ all_new_xyz = torch.cat(all_new_xyz, dim=0)
674
+ all_new_features_dc = torch.cat(all_new_features_dc, dim=0)
675
+ new_tmp_radii = torch.zeros(all_new_xyz.shape[0])
676
+ prune_mask = torch.ones(all_new_xyz.shape[0], dtype=torch.bool)
677
+
678
+ gaussians.densification_postfix(all_new_xyz[prune_mask].to(device),
679
+ all_new_features_dc[prune_mask].to(device),
680
+ torch.cat(all_new_features_rest, dim=0)[prune_mask].to(device),
681
+ torch.cat(all_new_opacities, dim=0)[prune_mask].to(device),
682
+ torch.cat(all_new_scaling, dim=0)[prune_mask].to(device),
683
+ torch.cat(all_new_rotation, dim=0)[prune_mask].to(device),
684
+ new_tmp_radii[prune_mask].to(device))
685
+
686
+ return viewpoint_stack, closest_indices_selected, visualizations
687
+
688
+
689
+
690
+ def extract_keypoints_and_colors_single(imA, imB, matches, roma_model, verbose=False, output_dict={}):
691
+ """
692
+ Extracts keypoints and corresponding colors from a source image (imA) and a single target image (imB).
693
+
694
+ Args:
695
+ imA: Source image as a NumPy array (H_A, W_A, C).
696
+ imB: Target image as a NumPy array (H_B, W_B, C).
697
+ matches: Matches in normalized coordinates (torch.Tensor).
698
+ roma_model: Roma model instance for keypoint operations.
699
+ verbose: If True, outputs intermediate visualizations.
700
+ Returns:
701
+ kptsA_np: Keypoints in imA (normalized).
702
+ kptsB_np: Keypoints in imB (normalized).
703
+ kptsA_color: Colors of keypoints in imA.
704
+ kptsB_color: Colors of keypoints in imB.
705
+ """
706
+ H_A, W_A, _ = imA.shape
707
+ H_B, W_B, _ = imB.shape
708
+
709
+ # Convert matches to pixel coordinates
710
+ # Matches format: (B, 4) = (x1_norm, y1_norm, x2_norm, y2_norm)
711
+ kptsA = matches[:, :2] # [N, 2]
712
+ kptsB = matches[:, 2:] # [N, 2]
713
+
714
+ # Scale normalized coordinates [-1,1] to pixel coordinates
715
+ kptsA_pix = torch.zeros_like(kptsA)
716
+ kptsB_pix = torch.zeros_like(kptsB)
717
+
718
+ # Important! [Normalized to pixel space]
719
+ kptsA_pix[:, 0] = (kptsA[:, 0] + 1) * (W_A - 1) / 2
720
+ kptsA_pix[:, 1] = (kptsA[:, 1] + 1) * (H_A - 1) / 2
721
+
722
+ kptsB_pix[:, 0] = (kptsB[:, 0] + 1) * (W_B - 1) / 2
723
+ kptsB_pix[:, 1] = (kptsB[:, 1] + 1) * (H_B - 1) / 2
724
+
725
+ kptsA_np = kptsA_pix.detach().cpu().numpy()
726
+ kptsB_np = kptsB_pix.detach().cpu().numpy()
727
+
728
+ # Extract colors
729
+ kptsA_x = np.round(kptsA_np[:, 0]).astype(int)
730
+ kptsA_y = np.round(kptsA_np[:, 1]).astype(int)
731
+ kptsB_x = np.round(kptsB_np[:, 0]).astype(int)
732
+ kptsB_y = np.round(kptsB_np[:, 1]).astype(int)
733
+
734
+ kptsA_color = imA[np.clip(kptsA_y, 0, H_A-1), np.clip(kptsA_x, 0, W_A-1)]
735
+ kptsB_color = imB[np.clip(kptsB_y, 0, H_B-1), np.clip(kptsB_x, 0, W_B-1)]
736
+
737
+ # Normalize keypoints into [-1, 1] for downstream triangulation
738
+ kptsA_np_norm = np.zeros_like(kptsA_np)
739
+ kptsB_np_norm = np.zeros_like(kptsB_np)
740
+
741
+ kptsA_np_norm[:, 0] = kptsA_np[:, 0] / (W_A - 1) * 2.0 - 1.0
742
+ kptsA_np_norm[:, 1] = kptsA_np[:, 1] / (H_A - 1) * 2.0 - 1.0
743
+
744
+ kptsB_np_norm[:, 0] = kptsB_np[:, 0] / (W_B - 1) * 2.0 - 1.0
745
+ kptsB_np_norm[:, 1] = kptsB_np[:, 1] / (H_B - 1) * 2.0 - 1.0
746
+
747
+ return kptsA_np_norm, kptsB_np_norm, kptsA_color, kptsB_color
748
+
749
+
750
+
751
+ def init_gaussians_with_corr_fast(gaussians, scene, cfg, device, verbose=False, roma_model=None):
752
+ timings = defaultdict(list)
753
+
754
+ if roma_model is None:
755
+ if cfg.roma_model == "indoors":
756
+ roma_model = roma_indoor(device=device)
757
+ else:
758
+ roma_model = roma_outdoor(device=device)
759
+ roma_model.upsample_preds = False
760
+ roma_model.symmetric = False
761
+
762
+ M = cfg.matches_per_ref
763
+ upper_thresh = roma_model.sample_thresh
764
+ scaling_factor = cfg.scaling_factor
765
+ expansion_factor = 1
766
+ keypoint_fit_error_tolerance = cfg.proj_err_tolerance
767
+ visualizations = {}
768
+ viewpoint_stack = scene.getTrainCameras().copy()
769
+ NUM_REFERENCE_FRAMES = min(cfg.num_refs, len(viewpoint_stack))
770
+ NUM_NNS_PER_REFERENCE = 1 # Only ONE neighbor now!
771
+
772
+ viewpoint_cam_all = torch.stack([x.world_view_transform.flatten() for x in viewpoint_stack], axis=0)
773
+
774
+ selected_indices = select_cameras_kmeans(cameras=viewpoint_cam_all.detach().cpu().numpy(), K=NUM_REFERENCE_FRAMES)
775
+ selected_indices = sorted(selected_indices)
776
+
777
+ viewpoint_cam_all = torch.stack([x.world_view_transform.flatten() for x in viewpoint_stack], axis=0)
778
+ closest_indices = k_closest_vectors(viewpoint_cam_all, NUM_NNS_PER_REFERENCE)
779
+ closest_indices_selected = closest_indices[:, :].detach().cpu().numpy()
780
+
781
+ all_new_xyz = []
782
+ all_new_features_dc = []
783
+ all_new_features_rest = []
784
+ all_new_opacities = []
785
+ all_new_scaling = []
786
+ all_new_rotation = []
787
+
788
+ # Dummy first pass to initialize model
789
+ with torch.no_grad():
790
+ viewpoint_cam1 = viewpoint_stack[0]
791
+ viewpoint_cam2 = viewpoint_stack[1]
792
+ imA = viewpoint_cam1.original_image.detach().cpu().numpy().transpose(1, 2, 0)
793
+ imB = viewpoint_cam2.original_image.detach().cpu().numpy().transpose(1, 2, 0)
794
+ imA = Image.fromarray(np.clip(imA * 255, 0, 255).astype(np.uint8))
795
+ imB = Image.fromarray(np.clip(imB * 255, 0, 255).astype(np.uint8))
796
+ warp, certainty_warp = roma_model.match(imA, imB, device=device)
797
+ del warp, certainty_warp
798
+ torch.cuda.empty_cache()
799
+
800
+ # Main Loop over source_idx
801
+ for source_idx in tqdm(sorted(selected_indices), desc="Profiling source frames"):
802
+
803
+ # =================== Step 1: Compute Warp and Certainty ===================
804
+ start = time.time()
805
+ viewpoint_cam1 = viewpoint_stack[source_idx]
806
+ NNs=closest_indices_selected.shape[1]
807
+ viewpoint_cam2 = viewpoint_stack[closest_indices_selected[source_idx, np.random.randint(NNs)]]
808
+ imA = viewpoint_cam1.original_image.detach().cpu().numpy().transpose(1, 2, 0)
809
+ imB = viewpoint_cam2.original_image.detach().cpu().numpy().transpose(1, 2, 0)
810
+ imA = Image.fromarray(np.clip(imA * 255, 0, 255).astype(np.uint8))
811
+ imB = Image.fromarray(np.clip(imB * 255, 0, 255).astype(np.uint8))
812
+ warp, certainty_warp = roma_model.match(imA, imB, device=device)
813
+
814
+ certainties_max = certainty_warp # New manual sampling
815
+ timings['aggregation_warp_certainty'].append(time.time() - start)
816
+
817
+ # =================== Step 2: Good Samples Selection ===================
818
+ start = time.time()
819
+ certainty = certainties_max.reshape(-1).clone()
820
+ certainty[certainty > upper_thresh] = 1
821
+ good_samples = torch.multinomial(certainty, num_samples=min(expansion_factor * M, len(certainty)), replacement=False)
822
+ timings['good_samples_selection'].append(time.time() - start)
823
+
824
+ # =================== Step 3: Triangulate Keypoints ===================
825
+ reference_image_dict = {
826
+ "triangulated_points": [],
827
+ "triangulated_points_errors_proj1": [],
828
+ "triangulated_points_errors_proj2": []
829
+ }
830
+
831
+ start = time.time()
832
+ matches_NN = warp.reshape(-1, 4)[good_samples]
833
+
834
+ # Convert matches to pixel coordinates
835
+ kptsA_np, kptsB_np, kptsA_color, kptsB_color = extract_keypoints_and_colors_single(
836
+ np.array(imA).astype(np.uint8),
837
+ np.array(imB).astype(np.uint8),
838
+ matches_NN,
839
+ roma_model
840
+ )
841
+
842
+ proj_matrices_A = viewpoint_stack[source_idx].full_proj_transform
843
+ proj_matrices_B = viewpoint_stack[closest_indices_selected[source_idx, 0]].full_proj_transform
844
+
845
+ triangulated_points, triangulated_points_errors_proj1, triangulated_points_errors_proj2 = triangulate_points(
846
+ P1=torch.stack([proj_matrices_A] * M, axis=0),
847
+ P2=torch.stack([proj_matrices_B] * M, axis=0),
848
+ k1_x=kptsA_np[:M, 0], k1_y=kptsA_np[:M, 1],
849
+ k2_x=kptsB_np[:M, 0], k2_y=kptsB_np[:M, 1])
850
+
851
+ reference_image_dict["triangulated_points"].append(triangulated_points)
852
+ reference_image_dict["triangulated_points_errors_proj1"].append(triangulated_points_errors_proj1)
853
+ reference_image_dict["triangulated_points_errors_proj2"].append(triangulated_points_errors_proj2)
854
+ timings['triangulation_per_NN'].append(time.time() - start)
855
+
856
+ # =================== Step 4: Select Best Triangulated Points ===================
857
+ start = time.time()
858
+ NNs_triangulated_points_selected, NNs_triangulated_points_selected_proj_errors = select_best_keypoints(
859
+ NNs_triangulated_points=torch.stack(reference_image_dict["triangulated_points"], dim=0),
860
+ NNs_errors_proj1=np.stack(reference_image_dict["triangulated_points_errors_proj1"], axis=0),
861
+ NNs_errors_proj2=np.stack(reference_image_dict["triangulated_points_errors_proj2"], axis=0))
862
+ timings['select_best_keypoints'].append(time.time() - start)
863
+
864
+ # =================== Step 5: Create New Gaussians ===================
865
+ start = time.time()
866
+ viewpoint_cam1 = viewpoint_stack[source_idx]
867
+ N = len(NNs_triangulated_points_selected)
868
+ new_xyz = NNs_triangulated_points_selected[:, :-1]
869
+ all_new_xyz.append(new_xyz)
870
+ all_new_features_dc.append(RGB2SH(torch.tensor(kptsA_color.astype(np.float32) / 255.)).unsqueeze(1))
871
+ all_new_features_rest.append(torch.stack([gaussians._features_rest[-1].clone().detach() * 0.] * N, dim=0))
872
+
873
+ mask_bad_points = torch.tensor(
874
+ NNs_triangulated_points_selected_proj_errors > keypoint_fit_error_tolerance,
875
+ dtype=torch.float32).unsqueeze(1).to(device)
876
+
877
+ all_new_opacities.append(torch.stack([gaussians._opacity[-1].clone().detach()] * N, dim=0) * 0. - mask_bad_points * (1e1))
878
+
879
+ dist_points_to_cam1 = torch.linalg.norm(viewpoint_cam1.camera_center.clone().detach() - new_xyz, dim=1, ord=2)
880
+ all_new_scaling.append(gaussians.scaling_inverse_activation((dist_points_to_cam1 * scaling_factor).unsqueeze(1).repeat(1, 3)))
881
+ all_new_rotation.append(torch.stack([gaussians._rotation[-1].clone().detach()] * N, dim=0))
882
+ timings['save_gaussians'].append(time.time() - start)
883
+
884
+ # =================== Final Densification Postfix ===================
885
+ start = time.time()
886
+ all_new_xyz = torch.cat(all_new_xyz, dim=0)
887
+ all_new_features_dc = torch.cat(all_new_features_dc, dim=0)
888
+ new_tmp_radii = torch.zeros(all_new_xyz.shape[0])
889
+ prune_mask = torch.ones(all_new_xyz.shape[0], dtype=torch.bool)
890
+
891
+ gaussians.densification_postfix(
892
+ all_new_xyz[prune_mask].to(device),
893
+ all_new_features_dc[prune_mask].to(device),
894
+ torch.cat(all_new_features_rest, dim=0)[prune_mask].to(device),
895
+ torch.cat(all_new_opacities, dim=0)[prune_mask].to(device),
896
+ torch.cat(all_new_scaling, dim=0)[prune_mask].to(device),
897
+ torch.cat(all_new_rotation, dim=0)[prune_mask].to(device),
898
+ new_tmp_radii[prune_mask].to(device)
899
+ )
900
+ timings['final_densification_postfix'].append(time.time() - start)
901
+
902
+ # =================== Print Profiling Results ===================
903
+ print("\n=== Profiling Summary (average per frame) ===")
904
+ for key, times in timings.items():
905
+ print(f"{key:35s}: {sum(times) / len(times):.4f} sec (total {sum(times):.2f} sec)")
906
+
907
+ return viewpoint_stack, closest_indices_selected, visualizations
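Most of this file depends on a scene and a RoMa checkpoint, but the reference-frame selection helpers at the top (`select_cameras_kmeans`, `k_closest_vectors`) can be exercised in isolation. A minimal sketch with toy cameras, assuming the repository root (and its submodules, as the sys.path manipulation at the top of the file expects) is importable:

```python
# Toy sketch of the camera-selection helpers from source/corr_init.py.
import torch
from source.corr_init import select_cameras_kmeans, k_closest_vectors

cams = torch.randn(20, 16)                           # 20 fake cameras, flattened 4x4 matrices
ref_idx = select_cameras_kmeans(cams.numpy(), K=4)   # indices of 4 K-means reference frames
nn_idx = k_closest_vectors(cams, k=3)                # [20, 3] nearest-neighbour indices per camera
print(sorted(ref_idx), nn_idx.shape)
```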
source/data_utils.py ADDED
@@ -0,0 +1,28 @@
1
+ def scene_cameras_train_test_split(scene, verbose=False):
2
+ """
3
+ Iterate over resolutions in the scene. For each resolution, check whether it has test_cameras;
4
+ if it doesn't, move every 8th camera from the train set to the test set. This follows the
5
+ evaluation protocol suggested by Kerbl et al. in the seminal work on 3DGS. All changes to the input
6
+ object scene are inplace changes.
7
+ :param scene: Scene Class object from the gaussian-splatting.scene module
8
+ :param verbose: Print initial and final stage of the function
9
+ :return: None
10
+
11
+ """
12
+ if verbose: print("Preparing train and test sets split...")
13
+ for resolution in scene.train_cameras.keys():
14
+ if len(scene.test_cameras[resolution]) == 0:
15
+ if verbose:
16
+ print(f"Found no test_cameras for resolution {resolution}. Move every 8th camera out ouf total "+\
17
+ f"{len(scene.train_cameras[resolution])} train cameras to the test set now")
18
+ N = len(scene.train_cameras[resolution])
19
+ scene.test_cameras[resolution] = [scene.train_cameras[resolution][idx] for idx in range(0, N)
20
+ if idx % 8 == 0]
21
+ scene.train_cameras[resolution] = [scene.train_cameras[resolution][idx] for idx in range(0, N)
22
+ if idx % 8 != 0]
23
+ if verbose:
24
+ print(f"Done. Now train and test sets contain each {len(scene.train_cameras[resolution])} and " + \
25
+ f"{len(scene.test_cameras[resolution])} cameras respectively.")
26
+
27
+
28
+ return
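The rule implemented above is simply "every 8th training camera becomes a test camera" (the protocol from the original 3DGS evaluation). A pure-Python illustration of the resulting index split:

```python
# Illustration of the every-8th-camera split used by scene_cameras_train_test_split.
N = 24  # pretend a resolution has 24 train cameras and no test cameras
test_idx = [i for i in range(N) if i % 8 == 0]    # -> [0, 8, 16]
train_idx = [i for i in range(N) if i % 8 != 0]   # the remaining 21 cameras
print(len(train_idx), len(test_idx))              # 21 3
```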
source/losses.py ADDED
@@ -0,0 +1,100 @@
1
+ # Code is copied from the gaussian-splatting/utils/loss_utils.py
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from torch.autograd import Variable
6
+ from math import exp
7
+
8
+ def l1_loss(network_output, gt, mean=True):
9
+ return torch.abs((network_output - gt)).mean() if mean else torch.abs((network_output - gt))
10
+
11
+ def l2_loss(network_output, gt):
12
+ return ((network_output - gt) ** 2).mean()
13
+
14
+ def gaussian(window_size, sigma):
15
+ gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)])
16
+ return gauss / gauss.sum()
17
+
18
+ def create_window(window_size, channel):
19
+ _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
20
+ _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
21
+ window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous())
22
+ return window
23
+
24
+ def ssim(img1, img2, window_size=11, size_average=True, mask = None):
25
+ channel = img1.size(-3)
26
+ window = create_window(window_size, channel)
27
+
28
+ if img1.is_cuda:
29
+ window = window.cuda(img1.get_device())
30
+ window = window.type_as(img1)
31
+
32
+ return _ssim(img1, img2, window, window_size, channel, size_average, mask)
33
+
34
+ def _ssim(img1, img2, window, window_size, channel, size_average=True, mask = None):
35
+ mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
36
+ mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
37
+
38
+ mu1_sq = mu1.pow(2)
39
+ mu2_sq = mu2.pow(2)
40
+ mu1_mu2 = mu1 * mu2
41
+
42
+ sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
43
+ sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
44
+ sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2
45
+
46
+ C1 = 0.01 ** 2
47
+ C2 = 0.03 ** 2
48
+
49
+ ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
50
+
51
+ if mask is not None:
52
+ ssim_map = ssim_map * mask
53
+
54
+ if size_average:
55
+ return ssim_map.mean()
56
+ else:
57
+ return ssim_map.mean(1).mean(1).mean(1)
58
+
59
+
60
+ def mse(img1, img2):
61
+ return (((img1 - img2)) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True)
62
+
63
+ def psnr(img1, img2):
64
+ """
65
+ Computes the Peak Signal-to-Noise Ratio (PSNR) between two single images. NOT BATCHED!
66
+ Args:
67
+ img1 (torch.Tensor): The first image tensor, with pixel values scaled between 0 and 1.
68
+ Shape should be (channels, height, width).
69
+ img2 (torch.Tensor): The second image tensor with the same shape as img1, used for comparison.
70
+
71
+ Returns:
72
+ torch.Tensor: A scalar tensor containing the PSNR value in decibels (dB).
73
+ """
74
+ mse = (((img1 - img2)) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True)
75
+ return 20 * torch.log10(1.0 / torch.sqrt(mse))
76
+
77
+
78
+ def tv_loss(image):
79
+ """
80
+ Computes the total variation (TV) loss for an image of shape [3, H, W].
81
+
82
+ Args:
83
+ image (torch.Tensor): Input image of shape [3, H, W]
84
+
85
+ Returns:
86
+ torch.Tensor: Scalar value representing the total variation loss.
87
+ """
88
+ # Ensure the image has the correct dimensions
89
+ assert image.ndim == 3 and image.shape[0] == 3, "Input must be of shape [3, H, W]"
90
+
91
+ # Compute the difference between adjacent pixels in the x-direction (width)
92
+ diff_x = torch.abs(image[:, :, 1:] - image[:, :, :-1])
93
+
94
+ # Compute the difference between adjacent pixels in the y-direction (height)
95
+ diff_y = torch.abs(image[:, 1:, :] - image[:, :-1, :])
96
+
97
+ # Sum the total variation in both directions
98
+ tv_loss_value = torch.mean(diff_x) + torch.mean(diff_y)
99
+
100
+ return tv_loss_value
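These helpers mirror `utils/loss_utils.py` from the gaussian-splatting codebase. A quick self-contained smoke test on random tensors, run from the repository root (shapes follow the docstrings above; a batch dimension is added explicitly for `ssim`):

```python
# Smoke test for source/losses.py on random images.
import torch
from source.losses import l1_loss, ssim, psnr, tv_loss

img1 = torch.rand(3, 64, 64)   # [C, H, W], values in [0, 1]
img2 = torch.rand(3, 64, 64)

print(l1_loss(img1, img2))                          # mean absolute error
print(ssim(img1.unsqueeze(0), img2.unsqueeze(0)))   # SSIM on a batch of one
print(psnr(img1, img2))                             # PSNR in dB (per channel; not batched)
print(tv_loss(img1))                                # total variation of a [3, H, W] image
```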
source/networks.py ADDED
@@ -0,0 +1,48 @@
1
+ import torch
2
+
3
+ import sys
4
+ sys.path.append('./submodules/gaussian-splatting/')
5
+
6
+ from random import randint
7
+ from scene import Scene, GaussianModel
8
+ from gaussian_renderer import render
9
+ from source.data_utils import scene_cameras_train_test_split
10
+
11
+ class Warper3DGS(torch.nn.Module):
12
+ def __init__(self, sh_degree, opt, pipe, dataset, viewpoint_stack, verbose,
13
+ do_train_test_split=True):
14
+ super(Warper3DGS, self).__init__()
15
+ """
16
+ Init Warper using all the objects necessary for rendering gaussian splats.
17
+ Here we merely link class attributes to the objects instantiated outside the class.
18
+ """
19
+ self.gaussians = GaussianModel(sh_degree)
20
+ self.gaussians.tmp_radii = torch.zeros((self.gaussians.get_xyz.shape[0]), device="cuda")
21
+ self.render = render
22
+ self.gs_config_opt = opt
23
+ bg_color = [1, 1, 1] if dataset.white_background else [0, 0, 0]
24
+ self.bg = torch.tensor(bg_color, dtype=torch.float32, device="cuda")
25
+ self.pipe = pipe
26
+ self.scene = Scene(dataset, self.gaussians, shuffle=False)
27
+ if do_train_test_split:
28
+ scene_cameras_train_test_split(self.scene, verbose=verbose)
29
+
30
+ self.gaussians.training_setup(opt)
31
+ self.viewpoint_stack = viewpoint_stack
32
+ if not self.viewpoint_stack:
33
+ self.viewpoint_stack = self.scene.getTrainCameras().copy()
34
+
35
+ def forward(self, viewpoint_cam=None):
36
+ """
37
+ For a provided camera viewpoint_cam we render gaussians from this viewpoint.
38
+ If no camera provided then we use the self.viewpoint_stack (list of cameras).
39
+ If the latter is empty we reinitialize it using the self.scene object.
40
+ """
41
+ if not viewpoint_cam:
42
+ if not self.viewpoint_stack:
43
+ self.viewpoint_stack = self.scene.getTrainCameras().copy()
44
+ viewpoint_cam = self.viewpoint_stack[randint(0, len(self.viewpoint_stack) - 1)]
45
+
46
+ render_pkg = self.render(viewpoint_cam, self.gaussians, self.pipe, self.bg)
47
+ return render_pkg
48
+
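Usage note (not part of the diff): a rough construction sketch for Warper3DGS, assuming the gaussian-splatting submodule is built and a COLMAP-formatted scene is available; the ModelParams/OptimizationParams/PipelineParams argument groups come from that submodule, and the paths below are placeholders.

    import sys
    sys.path.append('./submodules/gaussian-splatting/')
    from argparse import ArgumentParser
    from arguments import ModelParams, OptimizationParams, PipelineParams  # 3DGS submodule
    from source.networks import Warper3DGS

    parser = ArgumentParser()
    lp, op, pp = ModelParams(parser), OptimizationParams(parser), PipelineParams(parser)
    args = parser.parse_args(["--source_path", "path/to/scene", "--model_path", "path/to/output"])

    warper = Warper3DGS(sh_degree=3, opt=op.extract(args), pipe=pp.extract(args),
                        dataset=lp.extract(args), viewpoint_stack=None, verbose=True)
    render_pkg = warper()         # no camera given: a random training view is rendered
    image = render_pkg["render"]  # (3, H, W) image tensor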
source/timer.py ADDED
@@ -0,0 +1,24 @@
1
+ import time
2
+ class Timer:
3
+ def __init__(self):
4
+ self.start_time = None
5
+ self.elapsed = 0
6
+ self.paused = False
7
+
8
+ def start(self):
9
+ if self.start_time is None:
10
+ self.start_time = time.time()
11
+ elif self.paused:
12
+ self.start_time = time.time() - self.elapsed
13
+ self.paused = False
14
+
15
+ def pause(self):
16
+ if not self.paused:
17
+ self.elapsed = time.time() - self.start_time
18
+ self.paused = True
19
+
20
+ def get_elapsed_time(self):
21
+ if self.paused:
22
+ return self.elapsed
23
+ else:
24
+ return time.time() - self.start_time
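Usage note (not part of the diff): Timer is a simple pausable wall clock (the trainer pauses it around logging and evaluation so they do not count toward training time); a minimal sketch, assuming source.timer is importable.

    import time
    from source.timer import Timer

    t = Timer()
    t.start()
    time.sleep(0.2)   # timed work
    t.pause()         # excluded section (e.g. evaluation / logging)
    time.sleep(0.2)
    t.start()         # resume; the paused interval is not counted
    time.sleep(0.2)
    print(f"elapsed: {t.get_elapsed_time():.2f}s")  # ~0.4s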
source/trainer.py ADDED
@@ -0,0 +1,265 @@
1
+ import torch
2
+ from random import randint
3
+ from tqdm.rich import trange
4
+ from tqdm import tqdm as tqdm
5
+ from source.networks import Warper3DGS
6
+ import wandb
7
+ import sys
8
+
9
+ sys.path.append('./submodules/gaussian-splatting/')
10
+ import lpips
11
+ from source.losses import ssim, l1_loss, psnr
12
+ from rich.console import Console
13
+ from rich.theme import Theme
14
+
15
+ custom_theme = Theme({
16
+ "info": "dim cyan",
17
+ "warning": "magenta",
18
+ "danger": "bold red"
19
+ })
20
+
21
+ from source.corr_init import init_gaussians_with_corr, init_gaussians_with_corr_fast
22
+ from source.utils_aux import log_samples
23
+
24
+ from source.timer import Timer
25
+
26
+ class EDGSTrainer:
27
+ def __init__(self,
28
+ GS: Warper3DGS,
29
+ training_config,
30
+ dataset_white_background=False,
31
+ device=torch.device('cuda'),
32
+ log_wandb=True,
33
+ ):
34
+ self.GS = GS
35
+ self.scene = GS.scene
36
+ self.viewpoint_stack = GS.viewpoint_stack
37
+ self.gaussians = GS.gaussians
38
+
39
+ self.training_config = training_config
40
+ self.GS_optimizer = GS.gaussians.optimizer
41
+ self.dataset_white_background = dataset_white_background
42
+
43
+ self.training_step = 1
44
+ self.gs_step = 0
45
+ self.CONSOLE = Console(width=120, theme=custom_theme)
46
+ self.saving_iterations = training_config.save_iterations
47
+ self.evaluate_iterations = None
48
+ self.batch_size = training_config.batch_size
49
+ self.ema_loss_for_log = 0.0
50
+
51
+ # Logs in the format {step:{"loss1":loss1_value, "loss2":loss2_value}}
52
+ self.logs_losses = {}
53
+ self.lpips = lpips.LPIPS(net='vgg').to(device)
54
+ self.device = device
55
+ self.timer = Timer()
56
+ self.log_wandb = log_wandb
57
+
58
+ def load_checkpoints(self, load_cfg):
59
+ # Load 3DGS checkpoint
60
+ if load_cfg.gs:
61
+ self.GS.gaussians.restore(
62
+ torch.load(f"{load_cfg.gs}/chkpnt{load_cfg.gs_step}.pth")[0],
63
+ self.training_config)
64
+ self.GS_optimizer = self.GS.gaussians.optimizer
65
+ self.CONSOLE.print(f"3DGS loaded from checkpoint for iteration {load_cfg.gs_step}",
66
+ style="info")
67
+ self.training_step += load_cfg.gs_step
68
+ self.gs_step += load_cfg.gs_step
69
+
70
+ def train(self, train_cfg):
71
+ # 3DGS training
72
+ self.CONSOLE.print("Train 3DGS for {} iterations".format(train_cfg.gs_epochs), style="info")
73
+ with trange(self.training_step, self.training_step + train_cfg.gs_epochs, desc="[green]Train gaussians") as progress_bar:
74
+ for self.training_step in progress_bar:
75
+ radii = self.train_step_gs(max_lr=train_cfg.max_lr, no_densify=train_cfg.no_densify)
76
+ with torch.no_grad():
77
+ if train_cfg.no_densify:
78
+ self.prune(radii)
79
+ else:
80
+ self.densify_and_prune(radii)
81
+ if train_cfg.reduce_opacity:
82
+ # Slightly reduce opacity every few steps:
83
+ if self.gs_step < self.training_config.densify_until_iter and self.gs_step % 10 == 0:
84
+ opacities_new = torch.log(torch.exp(self.GS.gaussians._opacity.data) * 0.99)
85
+ self.GS.gaussians._opacity.data = opacities_new
86
+ self.timer.pause()
87
+ # Progress bar
88
+ if self.training_step % 10 == 0:
89
+ progress_bar.set_postfix({"[red]Loss": f"{self.ema_loss_for_log:.{7}f}"}, refresh=True)
90
+ # Log and save
91
+ if self.training_step in self.saving_iterations:
92
+ self.save_model()
93
+ if self.evaluate_iterations is not None:
94
+ if self.training_step in self.evaluate_iterations:
95
+ self.evaluate()
96
+ else:
97
+ if (self.training_step <= 3000 and self.training_step % 500 == 0) or \
98
+ (self.training_step > 3000 and self.training_step % 1000 == 228) :
99
+ self.evaluate()
100
+
101
+ self.timer.start()
102
+
103
+
104
+ def evaluate(self):
105
+ torch.cuda.empty_cache()
106
+ log_gen_images, log_real_images = [], []
107
+ validation_configs = ({'name': 'test', 'cameras': self.scene.getTestCameras(), 'cam_idx': self.training_config.TEST_CAM_IDX_TO_LOG},
108
+ {'name': 'train',
109
+ 'cameras': [self.scene.getTrainCameras()[idx % len(self.scene.getTrainCameras())] for idx in
110
+ range(0, 150, 5)], 'cam_idx': 10})
111
+ if self.log_wandb:
112
+ wandb.log({f"Number of Gaussians": len(self.GS.gaussians._xyz)}, step=self.training_step)
113
+ for config in validation_configs:
114
+ if config['cameras'] and len(config['cameras']) > 0:
115
+ l1_test = 0.0
116
+ psnr_test = 0.0
117
+ ssim_test = 0.0
118
+ lpips_splat_test = 0.0
119
+ for idx, viewpoint in enumerate(config['cameras']):
120
+ image = torch.clamp(self.GS(viewpoint)["render"], 0.0, 1.0)
121
+ gt_image = torch.clamp(viewpoint.original_image.to(self.device), 0.0, 1.0)
122
+ l1_test += l1_loss(image, gt_image).double()
123
+ psnr_test += psnr(image.unsqueeze(0), gt_image.unsqueeze(0)).double()
124
+ ssim_test += ssim(image, gt_image).double()
125
+ lpips_splat_test += self.lpips(image, gt_image).detach().double()
126
+ if idx in [config['cam_idx']]:
127
+ log_gen_images.append(image)
128
+ log_real_images.append(gt_image)
129
+ psnr_test /= len(config['cameras'])
130
+ l1_test /= len(config['cameras'])
131
+ ssim_test /= len(config['cameras'])
132
+ lpips_splat_test /= len(config['cameras'])
133
+ if self.log_wandb:
134
+ wandb.log({f"{config['name']}/L1": l1_test.item(), f"{config['name']}/PSNR": psnr_test.item(), \
135
+ f"{config['name']}/SSIM": ssim_test.item(), f"{config['name']}/LPIPS_splat": lpips_splat_test.item()}, step = self.training_step)
136
+ self.CONSOLE.print("\n[ITER {}], #{} gaussians, Evaluating {}: L1={:.6f}, PSNR={:.6f}, SSIM={:.6f}, LPIPS_splat={:.6f} ".format(
137
+ self.training_step, len(self.GS.gaussians._xyz), config['name'], l1_test.item(), psnr_test.item(), ssim_test.item(), lpips_splat_test.item()), style="info")
138
+ if self.log_wandb:
139
+ with torch.no_grad():
140
+ log_samples(torch.stack((log_real_images[0],log_gen_images[0])) , [], self.training_step, caption="Real and Generated Samples")
141
+ wandb.log({"time": self.timer.get_elapsed_time()}, step=self.training_step)
142
+ torch.cuda.empty_cache()
143
+
144
+ def train_step_gs(self, max_lr = False, no_densify = False):
145
+ self.gs_step += 1
146
+ if max_lr:
147
+ self.GS.gaussians.update_learning_rate(max(self.gs_step, 8_000))
148
+ else:
149
+ self.GS.gaussians.update_learning_rate(self.gs_step)
150
+ # Every 1000 its we increase the levels of SH up to a maximum degree
151
+ if self.gs_step % 1000 == 0:
152
+ self.GS.gaussians.oneupSHdegree()
153
+
154
+ # Pick a random Camera
155
+ if not self.viewpoint_stack:
156
+ self.viewpoint_stack = self.scene.getTrainCameras().copy()
157
+ viewpoint_cam = self.viewpoint_stack.pop(randint(0, len(self.viewpoint_stack) - 1))
158
+
159
+ render_pkg = self.GS(viewpoint_cam=viewpoint_cam)
160
+ image = render_pkg["render"]
161
+ # Loss
162
+ gt_image = viewpoint_cam.original_image.to(self.device)
163
+ L1_loss = l1_loss(image, gt_image)
164
+
165
+ ssim_loss = (1.0 - ssim(image, gt_image))
166
+ loss = (1.0 - self.training_config.lambda_dssim) * L1_loss + \
167
+ self.training_config.lambda_dssim * ssim_loss
168
+ self.timer.pause()
169
+ self.logs_losses[self.training_step] = {"loss": loss.item(),
170
+ "L1_loss": L1_loss.item(),
171
+ "ssim_loss": ssim_loss.item()}
172
+
173
+ if self.log_wandb:
174
+ for k, v in self.logs_losses[self.training_step].items():
175
+ wandb.log({f"train/{k}": v}, step=self.training_step)
176
+ self.ema_loss_for_log = 0.4 * self.logs_losses[self.training_step]["loss"] + 0.6 * self.ema_loss_for_log
177
+ self.timer.start()
178
+ self.GS_optimizer.zero_grad(set_to_none=True)
179
+ loss.backward()
180
+ with torch.no_grad():
181
+ if self.gs_step < self.training_config.densify_until_iter and not no_densify:
182
+ self.GS.gaussians.max_radii2D[render_pkg["visibility_filter"]] = torch.max(
183
+ self.GS.gaussians.max_radii2D[render_pkg["visibility_filter"]],
184
+ render_pkg["radii"][render_pkg["visibility_filter"]])
185
+ self.GS.gaussians.add_densification_stats(render_pkg["viewspace_points"],
186
+ render_pkg["visibility_filter"])
187
+
188
+ # Optimizer step
189
+ self.GS_optimizer.step()
190
+ self.GS_optimizer.zero_grad(set_to_none=True)
191
+ return render_pkg["radii"]
192
+
193
+ def densify_and_prune(self, radii = None):
194
+ # Densification or pruning
195
+ if self.gs_step < self.training_config.densify_until_iter:
196
+ if (self.gs_step > self.training_config.densify_from_iter) and \
197
+ (self.gs_step % self.training_config.densification_interval == 0):
198
+ size_threshold = 20 if self.gs_step > self.training_config.opacity_reset_interval else None
199
+ self.GS.gaussians.densify_and_prune(self.training_config.densify_grad_threshold,
200
+ 0.005,
201
+ self.GS.scene.cameras_extent,
202
+ size_threshold, radii)
203
+ if self.gs_step % self.training_config.opacity_reset_interval == 0 or (
204
+ self.dataset_white_background and self.gs_step == self.training_config.densify_from_iter):
205
+ self.GS.gaussians.reset_opacity()
206
+
207
+
208
+
209
+ def save_model(self):
210
+ print("\n[ITER {}] Saving Gaussians".format(self.gs_step))
211
+ self.scene.save(self.gs_step)
212
+ print("\n[ITER {}] Saving Checkpoint".format(self.gs_step))
213
+ torch.save((self.GS.gaussians.capture(), self.gs_step),
214
+ self.scene.model_path + "/chkpnt" + str(self.gs_step) + ".pth")
215
+
216
+
217
+ def init_with_corr(self, cfg, verbose=False, roma_model=None):
218
+ """
219
+ Initializes gaussians from correspondence matches. Also removes the SfM init points unless cfg.add_SfM_init is set.
220
+ Args:
221
+ cfg: configuration section named init_wC. See train.yaml.
222
+ verbose: whether to print intermediate results. Useful for debugging.
223
+ roma_model: optionally, a pre-initialized RoMa model to avoid re-initializing
224
+ it every time.
225
+ """
226
+ if not cfg.use:
227
+ return None
228
+ N_splats_at_init = len(self.GS.gaussians._xyz)
229
+ print("N_splats_at_init:", N_splats_at_init)
230
+ if cfg.nns_per_ref == 1:
231
+ init_fn = init_gaussians_with_corr_fast
232
+ else:
233
+ init_fn = init_gaussians_with_corr
234
+ camera_set, selected_indices, visualization_dict = init_fn(
235
+ self.GS.gaussians,
236
+ self.scene,
237
+ cfg,
238
+ self.device,
239
+ verbose=verbose,
240
+ roma_model=roma_model)
241
+
242
+ # Remove SfM points and leave only matchings inits
243
+ if not cfg.add_SfM_init:
244
+ with torch.no_grad():
245
+ N_splats_after_init = len(self.GS.gaussians._xyz)
246
+ print("N_splats_after_init:", N_splats_after_init)
247
+ self.gaussians.tmp_radii = torch.zeros(self.gaussians._xyz.shape[0]).to(self.device)
248
+ mask = torch.concat([torch.ones(N_splats_at_init, dtype=torch.bool),
249
+ torch.zeros(N_splats_after_init-N_splats_at_init, dtype=torch.bool)],
250
+ axis=0)
251
+ self.GS.gaussians.prune_points(mask)
252
+ with torch.no_grad():
253
+ gaussians = self.gaussians
254
+ gaussians._scaling = gaussians.scaling_inverse_activation(gaussians.scaling_activation(gaussians._scaling)*0.5)
255
+ return visualization_dict
256
+
257
+
258
+ def prune(self, radii, min_opacity=0.005):
259
+ self.GS.gaussians.tmp_radii = radii
260
+ if self.gs_step < self.training_config.densify_until_iter:
261
+ prune_mask = (self.GS.gaussians.get_opacity < min_opacity).squeeze()
262
+ self.GS.gaussians.prune_points(prune_mask)
263
+ torch.cuda.empty_cache()
264
+ self.GS.gaussians.tmp_radii = None
265
+
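Usage note (not part of the diff): train_step_gs combines L1 and DSSIM exactly as in vanilla 3DGS, loss = (1 - lambda_dssim) * L1 + lambda_dssim * (1 - SSIM), and smooths it with an exponential moving average for the progress bar; a minimal numeric sketch, assuming source.losses is importable (lambda_dssim = 0.2 is the usual 3DGS default, the real value comes from training_config).

    import torch
    from source.losses import l1_loss, ssim

    lambda_dssim = 0.2
    img = torch.rand(3, 64, 64)
    gt = torch.rand(3, 64, 64)

    L1 = l1_loss(img, gt)
    ssim_loss = 1.0 - ssim(img, gt)
    loss = (1.0 - lambda_dssim) * L1 + lambda_dssim * ssim_loss

    ema = 0.0
    ema = 0.4 * loss.item() + 0.6 * ema  # same smoothing as ema_loss_for_log above
    print(loss.item(), ema)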
source/utils_aux.py ADDED
@@ -0,0 +1,92 @@
1
+ # Perlin noise code taken from https://gist.github.com/adefossez/0646dbe9ed4005480a2407c62aac8869
2
+ from types import SimpleNamespace
3
+ import random
4
+ import numpy as np
5
+ import torch
6
+ import torchvision
7
+ import wandb
8
+ import random
9
+ import torchvision.transforms as T
10
+ import torchvision.transforms.functional as F
11
+ import torch
12
+ from PIL import Image
13
+
14
+ def parse_dict_to_namespace(dict_nested):
15
+ """Turns nested dictionary into nested namespaces"""
16
+ if type(dict_nested) != dict and type(dict_nested) != list: return dict_nested
17
+ x = SimpleNamespace()
18
+ for key, val in dict_nested.items():
19
+ if type(val) == dict:
20
+ setattr(x, key, parse_dict_to_namespace(val))
21
+ elif type(val) == list:
22
+ setattr(x, key, [parse_dict_to_namespace(v) for v in val])
23
+ else:
24
+ setattr(x, key, val)
25
+ return x
26
+
27
+ def set_seed(seed=42, cuda=True):
28
+ random.seed(seed)
29
+ np.random.seed(seed)
30
+ torch.manual_seed(seed)
31
+ if cuda:
32
+ torch.cuda.manual_seed_all(seed)
33
+
34
+
35
+
36
+ def log_samples(samples, scores, iteration, caption="Real Samples"):
37
+ # Create a grid of images
38
+ grid = torchvision.utils.make_grid(samples)
39
+
40
+ # Log the images and scores to wandb
41
+ wandb.log({
42
+ f"{caption}_images": [wandb.Image(grid, caption=f"{caption}: {scores}")],
43
+ }, step = iteration)
44
+
45
+
46
+
47
+ def pairwise_distances(matrix):
48
+ """
49
+ Computes the pairwise Euclidean distances between all vectors in the input matrix.
50
+
51
+ Args:
52
+ matrix (torch.Tensor): Input matrix of shape [N, D], where N is the number of vectors and D is the dimensionality.
53
+
54
+ Returns:
55
+ torch.Tensor: Pairwise distance matrix of shape [N, N].
56
+ """
57
+ # Compute pairwise Euclidean distances (torch.cdist with p=2 returns distances, not squared distances)
58
+ distances = torch.cdist(matrix, matrix, p=2)
59
+ return distances
60
+
61
+ def k_closest_vectors(matrix, k):
62
+ """
63
+ Finds the k-closest vectors for each vector in the input matrix based on Euclidean distance.
64
+
65
+ Args:
66
+ matrix (torch.Tensor): Input matrix of shape [N, D], where N is the number of vectors and D is the dimensionality.
67
+ k (int): Number of closest vectors to return for each vector.
68
+
69
+ Returns:
70
+ torch.Tensor: Indices of the k-closest vectors for each vector, excluding the vector itself.
71
+ """
72
+ # Compute pairwise distances
73
+ distances = pairwise_distances(matrix)
74
+
75
+ # For each vector, sort distances and get the indices of the k-closest vectors (excluding itself)
76
+ # Set diagonal distances to infinity to exclude the vector itself from the nearest neighbors
77
+ distances.fill_diagonal_(float('inf'))
78
+
79
+ # Get the indices of the k smallest distances (k-closest vectors)
80
+ _, indices = torch.topk(distances, k, largest=False, dim=1)
81
+
82
+ return indices
83
+
84
+ def process_image(image_tensor):
85
+ image_np = image_tensor.detach().cpu().numpy().transpose(1, 2, 0)
86
+ return Image.fromarray(np.clip(image_np * 255, 0, 255).astype(np.uint8))
87
+
88
+
89
+ def normalize_keypoints(kpts_np, width, height):
90
+ kpts_np[:, 0] = kpts_np[:, 0] / width * 2. - 1.
91
+ kpts_np[:, 1] = kpts_np[:, 1] / height * 2. - 1.
92
+ return kpts_np
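Usage note (not part of the diff): k_closest_vectors boils down to a cdist + topk pattern with the diagonal masked out; a self-contained sketch of the same technique on random points.

    import torch

    pts = torch.rand(100, 3)                  # N vectors of dimension D=3
    dist = torch.cdist(pts, pts, p=2)         # (N, N) pairwise Euclidean distances
    dist.fill_diagonal_(float('inf'))         # exclude each vector itself
    _, nn_idx = torch.topk(dist, k=4, largest=False, dim=1)
    print(nn_idx.shape)                       # torch.Size([100, 4])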
source/utils_preprocess.py ADDED
@@ -0,0 +1,334 @@
1
+ # This file contains functions for video or image collection preprocessing.
2
+ # For videos we run preprocessing and select the k sharpest frames.
3
+ # Afterwards the scene is reconstructed with COLMAP.
4
+ import cv2
5
+ import numpy as np
6
+ from tqdm import tqdm
7
+ import pycolmap
8
+ import os
9
+ import time
10
+ import tempfile
11
+ from moviepy import VideoFileClip
12
+ from matplotlib import pyplot as plt
13
+ from PIL import Image
14
+ import cv2
15
+ from tqdm import tqdm
16
+
17
+ WORKDIR = "../outputs/"
18
+
19
+
20
+ def get_rotation_moviepy(video_path):
21
+ clip = VideoFileClip(video_path)
22
+ rotation = 0
23
+
24
+ try:
25
+ displaymatrix = clip.reader.infos['inputs'][0]['streams'][2]['metadata'].get('displaymatrix', '')
26
+ if 'rotation of' in displaymatrix:
27
+ angle = float(displaymatrix.strip().split('rotation of')[-1].split('degrees')[0])
28
+ rotation = int(angle) % 360
29
+
30
+ except Exception as e:
31
+ print(f"No displaymatrix rotation found: {e}")
32
+
33
+ clip.reader.close()
34
+ #if clip.audio:
35
+ # clip.audio.reader.close_proc()
36
+
37
+ return rotation
38
+
39
+ def resize_max_side(frame, max_size):
40
+ h, w = frame.shape[:2]
41
+ scale = max_size / max(h, w)
42
+ if scale < 1:
43
+ frame = cv2.resize(frame, (int(w * scale), int(h * scale)))
44
+ return frame
45
+
46
+ def read_video_frames(video_input, k=1, max_size=1024):
47
+ """
48
+ Extracts every k-th frame from a video or list of images, resizes to max size, and returns frames as list.
49
+
50
+ Parameters:
51
+ video_input (str, file-like, or list): Path to video file, file-like object, or list of image files.
52
+ k (int): Interval for frame extraction (every k-th frame).
53
+ max_size (int): Maximum size for width or height after resizing.
54
+
55
+ Returns:
56
+ frames (list): List of resized frames (numpy arrays).
57
+ """
58
+ # Handle list of image files (not single video in a list)
59
+ if isinstance(video_input, list):
60
+ # If it's a single video in a list, treat it as video
61
+ if len(video_input) == 1 and video_input[0].name.endswith(('.mp4', '.avi', '.mov')):
62
+ video_input = video_input[0] # unwrap single video file
63
+ else:
64
+ # Treat as list of images
65
+ frames = []
66
+ for img_file in video_input:
67
+ img = Image.open(img_file.name).convert("RGB")
68
+ img.thumbnail((max_size, max_size))
69
+ frames.append(np.array(img)[...,::-1])
70
+ return frames
71
+
72
+ # Handle file-like or path
73
+ if hasattr(video_input, 'name'):
74
+ video_path = video_input.name
75
+ elif isinstance(video_input, (str, os.PathLike)):
76
+ video_path = str(video_input)
77
+ else:
78
+ raise ValueError("Unsupported video input type. Must be a filepath, file-like object, or list of images.")
79
+
80
+
81
+ cap = cv2.VideoCapture(video_path)
82
+ if not cap.isOpened():
83
+ raise ValueError(f"Error: Could not open video {video_path}.")
84
+
85
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
86
+ frame_count = 0
87
+ frames = []
88
+
89
+ with tqdm(total=total_frames // k, desc="Processing Video", unit="frame") as pbar:
90
+ while True:
91
+ ret, frame = cap.read()
92
+ if not ret:
93
+ break
94
+ if frame_count % k == 0:
95
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
96
+ h, w = frame.shape[:2]
97
+ scale = max(h, w) / max_size
98
+ if scale > 1:
99
+ frame = cv2.resize(frame, (int(w / scale), int(h / scale)))
100
+ frames.append(frame[...,[2,1,0]])
101
+ pbar.update(1)
102
+ frame_count += 1
103
+
104
+ cap.release()
105
+ return frames
106
+
107
+ def resize_max_side(frame, max_size):
108
+ """
109
+ Resizes the frame so that its largest side equals max_size, maintaining aspect ratio.
110
+ """
111
+ height, width = frame.shape[:2]
112
+ max_dim = max(height, width)
113
+
114
+ if max_dim <= max_size:
115
+ return frame # No need to resize
116
+
117
+ scale = max_size / max_dim
118
+ new_width = int(width * scale)
119
+ new_height = int(height * scale)
120
+
121
+ resized_frame = cv2.resize(frame, (new_width, new_height), interpolation=cv2.INTER_AREA)
122
+ return resized_frame
123
+
124
+
125
+
126
+ def variance_of_laplacian(image):
127
+ # compute the Laplacian of the image and then return the focus
128
+ # measure, which is simply the variance of the Laplacian
129
+ return cv2.Laplacian(image, cv2.CV_64F).var()
130
+
131
+ def process_all_frames(IMG_FOLDER = '/scratch/datasets/hq_data/night2_all_frames',
132
+ to_visualize=False,
133
+ save_images=True):
134
+ dict_scores = {}
135
+ for idx, img_name in tqdm(enumerate(sorted([x for x in os.listdir(IMG_FOLDER) if '.png' in x]))):
136
+
137
+ img = cv2.imread(os.path.join(IMG_FOLDER, img_name))#[250:, 100:]
138
+ gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
139
+ fm = variance_of_laplacian(gray) + \
140
+ variance_of_laplacian(cv2.resize(gray, (0,0), fx=0.75, fy=0.75)) + \
141
+ variance_of_laplacian(cv2.resize(gray, (0,0), fx=0.5, fy=0.5)) + \
142
+ variance_of_laplacian(cv2.resize(gray, (0,0), fx=0.25, fy=0.25))
143
+ if to_visualize:
144
+ plt.figure()
145
+ plt.title(f"Laplacian score: {fm:.2f}")
146
+ plt.imshow(img[..., [2,1,0]])
147
+ plt.show()
148
+ dict_scores[idx] = {"idx" : idx,
149
+ "img_name" : img_name,
150
+ "score" : fm}
151
+ if save_images:
152
+ dict_scores[idx]["img"] = img
153
+
154
+ return dict_scores
155
+
156
+ def select_optimal_frames(scores, k):
157
+ """
158
+ Selects a minimal subset of frames while ensuring no gaps exceed k.
159
+
160
+ Args:
161
+ scores (list of float): List of scores where index represents frame number.
162
+ k (int): Maximum allowed gap between selected frames.
163
+
164
+ Returns:
165
+ list of int: Indices of selected frames.
166
+ """
167
+ n = len(scores)
168
+ selected = [0, n-1]
169
+ i = 0 # Start at the first frame
170
+
171
+ while i < n:
172
+ # Find the best frame to select within the next k frames
173
+ best_idx = max(range(i, min(i + k + 1, n)), key=lambda x: scores[x], default=None)
174
+
175
+ if best_idx is None:
176
+ break # No more frames left
177
+
178
+ selected.append(best_idx)
179
+ i = best_idx + k + 1 # Move forward, ensuring gaps stay within k
180
+
181
+ return sorted(selected)
182
+
183
+
184
+ def variance_of_laplacian(image):
185
+ """
186
+ Compute the variance of Laplacian as a focus measure.
187
+ """
188
+ return cv2.Laplacian(image, cv2.CV_64F).var()
189
+
190
+ def preprocess_frames(frames, verbose=False):
191
+ """
192
+ Compute sharpness scores for a list of frames using multi-scale Laplacian variance.
193
+
194
+ Args:
195
+ frames (list of np.ndarray): List of frames (BGR images).
196
+ verbose (bool): If True, print scores.
197
+
198
+ Returns:
199
+ list of float: Sharpness scores for each frame.
200
+ """
201
+ scores = []
202
+
203
+ for idx, frame in enumerate(tqdm(frames, desc="Scoring frames")):
204
+ gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
205
+
206
+ fm = (
207
+ variance_of_laplacian(gray) +
208
+ variance_of_laplacian(cv2.resize(gray, (0, 0), fx=0.75, fy=0.75)) +
209
+ variance_of_laplacian(cv2.resize(gray, (0, 0), fx=0.5, fy=0.5)) +
210
+ variance_of_laplacian(cv2.resize(gray, (0, 0), fx=0.25, fy=0.25))
211
+ )
212
+
213
+ if verbose:
214
+ print(f"Frame {idx}: Sharpness Score = {fm:.2f}")
215
+
216
+ scores.append(fm)
217
+
218
+ return scores
219
+
220
+ def select_optimal_frames(scores, k):
221
+ """
222
+ Selects k frames by splitting into k segments and picking the sharpest frame from each.
223
+
224
+ Args:
225
+ scores (list of float): List of sharpness scores.
226
+ k (int): Number of frames to select.
227
+
228
+ Returns:
229
+ list of int: Indices of selected frames.
230
+ """
231
+ n = len(scores)
232
+ selected_indices = []
233
+ segment_size = n // k
234
+
235
+ for i in range(k):
236
+ start = i * segment_size
237
+ end = (i + 1) * segment_size if i < k - 1 else n # Last chunk may be larger
238
+ segment_scores = scores[start:end]
239
+
240
+ if len(segment_scores) == 0:
241
+ continue # Safety check if some segment is empty
242
+
243
+ best_in_segment = start + np.argmax(segment_scores)
244
+ selected_indices.append(best_in_segment)
245
+
246
+ return sorted(selected_indices)
247
+
248
+ def save_frames_to_scene_dir(frames, scene_dir):
249
+ """
250
+ Saves a list of frames into the target scene directory under 'images/' subfolder.
251
+
252
+ Args:
253
+ frames (list of np.ndarray): List of frames (BGR images) to save.
254
+ scene_dir (str): Target path where 'images/' subfolder will be created.
255
+ """
256
+ images_dir = os.path.join(scene_dir, "images")
257
+ os.makedirs(images_dir, exist_ok=True)
258
+
259
+ for idx, frame in enumerate(frames):
260
+ filename = os.path.join(images_dir, f"{idx:08d}.png") # 00000000.png, 00000001.png, etc.
261
+ cv2.imwrite(filename, frame)
262
+
263
+ print(f"Saved {len(frames)} frames to {images_dir}")
264
+
265
+
266
+ def run_colmap_on_scene(scene_dir):
267
+ """
268
+ Runs feature extraction, matching, and mapping on all images inside scene_dir/images using pycolmap.
269
+
270
+ Args:
271
+ scene_dir (str): Path to scene directory containing 'images' folder.
272
+
273
+ TODO: if the function fails to match all the frames, either increase the image size,
274
+ increase the number of features, or remove the unmatched frames from scene_dir/images.
275
+ """
276
+ start_time = time.time()
277
+ print(f"Running COLMAP pipeline on all images inside {scene_dir}")
278
+
279
+ # Setup paths
280
+ database_path = os.path.join(scene_dir, "database.db")
281
+ sparse_path = os.path.join(scene_dir, "sparse")
282
+ image_dir = os.path.join(scene_dir, "images")
283
+
284
+ # Make sure output directories exist
285
+ os.makedirs(sparse_path, exist_ok=True)
286
+
287
+ # Step 1: Feature Extraction
288
+ pycolmap.extract_features(
289
+ database_path,
290
+ image_dir,
291
+ sift_options={
292
+ "max_num_features": 512 * 2,
293
+ "max_image_size": 512 * 1,
294
+ }
295
+ )
296
+ print(f"Finished feature extraction in {(time.time() - start_time):.2f}s.")
297
+
298
+ # Step 2: Feature Matching
299
+ pycolmap.match_exhaustive(database_path)
300
+ print(f"Finished feature matching in {(time.time() - start_time):.2f}s.")
301
+
302
+ # Step 3: Mapping
303
+ pipeline_options = pycolmap.IncrementalPipelineOptions()
304
+ pipeline_options.min_num_matches = 15
305
+ pipeline_options.multiple_models = True
306
+ pipeline_options.max_num_models = 50
307
+ pipeline_options.max_model_overlap = 20
308
+ pipeline_options.min_model_size = 10
309
+ pipeline_options.extract_colors = True
310
+ pipeline_options.num_threads = 8
311
+ pipeline_options.mapper.init_min_num_inliers = 30
312
+ pipeline_options.mapper.init_max_error = 8.0
313
+ pipeline_options.mapper.init_min_tri_angle = 5.0
314
+
315
+ reconstruction = pycolmap.incremental_mapping(
316
+ database_path=database_path,
317
+ image_path=image_dir,
318
+ output_path=sparse_path,
319
+ options=pipeline_options,
320
+ )
321
+ print(f"Finished incremental mapping in {(time.time() - start_time):.2f}s.")
322
+
323
+ # Step 4: Post-process Cameras to SIMPLE_PINHOLE
324
+ recon_path = os.path.join(sparse_path, "0")
325
+ reconstruction = pycolmap.Reconstruction(recon_path)
326
+
327
+ for cam in reconstruction.cameras.values():
328
+ cam.model = 'SIMPLE_PINHOLE'
329
+ cam.params = cam.params[:3] # Keep only [f, cx, cy]
330
+
331
+ reconstruction.write(recon_path)
332
+
333
+ print(f"Total pipeline time: {(time.time() - start_time):.2f}s.")
334
+
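Usage note (not part of the diff): frame selection scores every frame with a multi-scale variance of the Laplacian and keeps the sharpest frame per segment; a self-contained sketch of that scoring/selection step on synthetic frames (the real pipeline chains read_video_frames -> preprocess_frames -> select_optimal_frames -> save_frames_to_scene_dir -> run_colmap_on_scene).

    import cv2
    import numpy as np

    rng = np.random.default_rng(0)
    frames = [rng.integers(0, 255, (120, 160, 3), dtype=np.uint8) for _ in range(30)]  # stand-in frames

    scores = []
    for frame in frames:
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        fm = sum(cv2.Laplacian(cv2.resize(gray, (0, 0), fx=s, fy=s), cv2.CV_64F).var()
                 for s in (1.0, 0.75, 0.5, 0.25))
        scores.append(fm)

    k = 8
    seg = len(scores) // k
    selected = []
    for i in range(k):
        start = i * seg
        end = (i + 1) * seg if i < k - 1 else len(scores)
        selected.append(start + int(np.argmax(scores[start:end])))
    print(selected)  # indices of the k sharpest frames, one per segment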
source/visualization.py ADDED
@@ -0,0 +1,1072 @@
1
+ from matplotlib import pyplot as plt
2
+ import numpy as np
3
+ import torch
4
+
5
+ import numpy as np
6
+ from typing import List
7
+ import sys
8
+ sys.path.append('./submodules/gaussian-splatting/')
9
+ from scene.cameras import Camera
10
+ from PIL import Image
11
+ import imageio
12
+ from scipy.interpolate import splprep, splev
13
+
14
+ import cv2
15
+ import numpy as np
16
+ import plotly.graph_objects as go
17
+ import numpy as np
18
+ from scipy.spatial.transform import Rotation as R, Slerp
19
+ from scipy.spatial import distance_matrix
20
+ from sklearn.decomposition import PCA
21
+ from scipy.interpolate import splprep, splev
22
+ from typing import List
23
+ from sklearn.mixture import GaussianMixture
24
+
25
+ def render_gaussians_rgb(generator3DGS, viewpoint_cam, visualize=False):
26
+ """
27
+ Simply render gaussians from the generator3DGS from the viewpoint_cam.
28
+ Args:
29
+ generator3DGS : instance of the Generator3DGS class from the networks.py file
30
+ viewpoint_cam : camera instance
31
+ visualize : boolean flag. If True, will call pyplot function and render image inplace
32
+ Returns:
33
+ uint8 numpy array with shape (H, W, 3) representing the image
34
+ """
35
+ with torch.no_grad():
36
+ render_pkg = generator3DGS(viewpoint_cam)
37
+ image = render_pkg["render"]
38
+ image_np = image.clone().detach().cpu().numpy().transpose(1, 2, 0)
39
+
40
+ # Clip values to be in the range [0, 1]
41
+ image_np = np.clip(image_np * 255, 0, 255).astype(np.uint8)
42
+ if visualize:
43
+ plt.figure(figsize=(12, 8))
44
+ plt.imshow(image_np)
45
+ plt.show()
46
+
47
+ return image_np
48
+
49
+ def render_gaussians_D_scores(generator3DGS, viewpoint_cam, mask=None, mask_channel=0, visualize=False):
50
+ """
51
+ Simply render D_scores of gaussians from the generator3DGS from the viewpoint_cam.
52
+ Args:
53
+ generator3DGS : instance of the Generator3DGS class from the networks.py file
54
+ viewpoint_cam : camera instance
55
+ visualize : boolean flag. If True, will call pyplot function and render image inplace
56
+ mask : optional mask to highlight specific gaussians. Must be of shape (N) where N is the number
57
+ of gaussians in generator3DGS.gaussians. Must be a torch tensor of floats, please scale according
58
+ to how much color you want to have. Recommended mask value is 10.
59
+ mask_channel: to which color channel should we add mask
60
+ Returns:
61
+ uint8 numpy array with shape (H, W, 3) representing the generator3DGS.gaussians.D_scores rendered as colors
62
+ """
63
+ with torch.no_grad():
64
+ # Visualize D_scores
65
+ generator3DGS.gaussians._features_dc = generator3DGS.gaussians._features_dc * 1e-4 + \
66
+ torch.stack([generator3DGS.gaussians.D_scores] * 3, axis=-1)
67
+ generator3DGS.gaussians._features_rest = generator3DGS.gaussians._features_rest * 1e-4
68
+ if mask is not None:
69
+ generator3DGS.gaussians._features_dc[..., mask_channel] += mask.unsqueeze(-1)
70
+ render_pkg = generator3DGS(viewpoint_cam)
71
+ image = render_pkg["render"]
72
+ image_np = image.clone().detach().cpu().numpy().transpose(1, 2, 0)
73
+
74
+ # Clip values to be in the range [0, 1]
75
+ image_np = np.clip(image_np * 255, 0, 255).astype(np.uint8)
76
+ if visualize:
77
+ plt.figure(figsize=(12, 8))
78
+ plt.imshow(image_np)
79
+ plt.show()
80
+
81
+ if mask is not None:
82
+ generator3DGS.gaussians._features_dc[..., mask_channel] -= mask.unsqueeze(-1)
83
+
84
+ generator3DGS.gaussians._features_dc = (generator3DGS.gaussians._features_dc - \
85
+ torch.stack([generator3DGS.gaussians.D_scores] * 3, axis=-1)) * 1e4
86
+ generator3DGS.gaussians._features_rest = generator3DGS.gaussians._features_rest * 1e4
87
+
88
+ return image_np
89
+
90
+
91
+
92
+ def normalize(v):
93
+ """
94
+ Normalize a vector to unit length.
95
+
96
+ Parameters:
97
+ v (np.ndarray): Input vector.
98
+
99
+ Returns:
100
+ np.ndarray: Unit vector in the same direction as `v`.
101
+ """
102
+ return v / np.linalg.norm(v)
103
+
104
+ def look_at_rotation(camera_position: np.ndarray, target: np.ndarray, world_up=np.array([0, 1, 0])):
105
+ """
106
+ Compute a rotation matrix for a camera looking at a target point.
107
+
108
+ Parameters:
109
+ camera_position (np.ndarray): The 3D position of the camera.
110
+ target (np.ndarray): The point the camera should look at.
111
+ world_up (np.ndarray): A vector that defines the global 'up' direction.
112
+
113
+ Returns:
114
+ np.ndarray: A 3x3 rotation matrix (camera-to-world) with columns [right, up, forward].
115
+ """
116
+ z_axis = normalize(target - camera_position) # Forward direction
117
+ x_axis = normalize(np.cross(world_up, z_axis)) # Right direction
118
+ y_axis = np.cross(z_axis, x_axis) # Recomputed up
119
+ return np.stack([x_axis, y_axis, z_axis], axis=1)
120
+
121
+
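Note (not part of the diff): look_at_rotation returns a camera-to-world rotation whose columns are [right, up, forward], so its third column is the unit vector from the camera toward the target; a small sanity-check sketch, assuming normalize and look_at_rotation defined above are in scope.

    import numpy as np

    cam_pos = np.array([2.0, 1.0, -3.0])
    target = np.array([0.0, 0.0, 0.0])
    Rcw = look_at_rotation(cam_pos, target)

    assert np.allclose(Rcw.T @ Rcw, np.eye(3), atol=1e-6)                  # orthonormal basis
    assert np.allclose(Rcw[:, 2], normalize(target - cam_pos), atol=1e-6)  # forward column looks at the target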
122
+ def generate_circular_camera_path(existing_cameras: List[Camera], N: int = 12, radius_scale: float = 1.0, d: float = 2.0) -> List[Camera]:
123
+ """
124
+ Generate a circular path of cameras around an existing camera group,
125
+ with each new camera oriented to look at the average viewing direction.
126
+
127
+ Parameters:
128
+ existing_cameras (List[Camera]): List of existing camera objects to estimate average orientation and layout.
129
+ N (int): Number of new cameras to generate along the circular path.
130
+ radius_scale (float): Scale factor to adjust the radius of the circle.
131
+ d (float): Distance ahead of each camera used to estimate its look-at point.
132
+
133
+ Returns:
134
+ List[Camera]: A list of newly generated Camera objects forming a circular path and oriented toward a shared view center.
135
+ """
136
+ # Step 1: Compute average camera position
137
+ center = np.mean([cam.T for cam in existing_cameras], axis=0)
138
+
139
+ # Estimate where each camera is looking
140
+ # d denotes how far ahead each camera sees — you can scale this
141
+ look_targets = [cam.T + cam.R[:, 2] * d for cam in existing_cameras]
142
+ center_of_view = np.mean(look_targets, axis=0)
143
+
144
+ # Step 2: Define circular plane basis using fixed up vector
145
+ avg_forward = normalize(np.mean([cam.R[:, 2] for cam in existing_cameras], axis=0))
146
+ up_guess = np.array([0, 1, 0])
147
+ right = normalize(np.cross(avg_forward, up_guess))
148
+ up = normalize(np.cross(right, avg_forward))
149
+
150
+ # Step 3: Estimate radius
151
+ avg_radius = np.mean([np.linalg.norm(cam.T - center) for cam in existing_cameras]) * radius_scale
152
+
153
+ # Step 4: Create cameras on a circular path
154
+ angles = np.linspace(0, 2 * np.pi, N, endpoint=False)
155
+ reference_cam = existing_cameras[0]
156
+ new_cameras = []
157
+
158
+
159
+ for i, a in enumerate(angles):
160
+ position = center + avg_radius * (np.cos(a) * right + np.sin(a) * up)
161
+
162
+ if d < 1e-5 or radius_scale < 1e-5:
163
+ # Use same orientation as the first camera
164
+ R = reference_cam.R.copy()
165
+ else:
166
+ # Change orientation
167
+ R = look_at_rotation(position, center_of_view)
168
+ new_cameras.append(Camera(
169
+ R=R,
170
+ T=position, # New position
171
+ FoVx=reference_cam.FoVx,
172
+ FoVy=reference_cam.FoVy,
173
+ resolution=(reference_cam.image_width, reference_cam.image_height),
174
+ colmap_id=-1,
175
+ depth_params=None,
176
+ image=Image.fromarray(np.zeros((reference_cam.image_height, reference_cam.image_width, 3), dtype=np.uint8)),
177
+ invdepthmap=None,
178
+ image_name=f"circular_a={a:.3f}",
179
+ uid=i
180
+ ))
181
+
182
+ return new_cameras
183
+
184
+
185
+ def save_numpy_frames_as_gif(frames, output_path="animation.gif", duration=100):
186
+ """
187
+ Save a list of RGB NumPy frames as a looping GIF animation.
188
+
189
+ Parameters:
190
+ frames (List[np.ndarray]): List of RGB images as uint8 NumPy arrays (shape HxWx3).
191
+ output_path (str): Path to save the output GIF.
192
+ duration (int): Duration per frame in milliseconds.
193
+
194
+ Returns:
195
+ None
196
+ """
197
+ pil_frames = [Image.fromarray(f) for f in frames]
198
+ pil_frames[0].save(
199
+ output_path,
200
+ save_all=True,
201
+ append_images=pil_frames[1:],
202
+ duration=duration, # duration per frame in ms
203
+ loop=0
204
+ )
205
+ print(f"GIF saved to: {output_path}")
206
+
207
+ def center_crop_frame(frame: np.ndarray, crop_fraction: float) -> np.ndarray:
208
+ """
209
+ Crop the central region of the frame by the given fraction.
210
+
211
+ Parameters:
212
+ frame (np.ndarray): Input RGB image (H, W, 3).
213
+ crop_fraction (float): Fraction of the original size to retain (e.g., 0.8 keeps 80%).
214
+
215
+ Returns:
216
+ np.ndarray: Cropped RGB image.
217
+ """
218
+ if crop_fraction >= 1.0:
219
+ return frame
220
+
221
+ h, w, _ = frame.shape
222
+ new_h, new_w = int(h * crop_fraction), int(w * crop_fraction)
223
+ start_y = (h - new_h) // 2
224
+ start_x = (w - new_w) // 2
225
+ return frame[start_y:start_y + new_h, start_x:start_x + new_w, :]
226
+
227
+
228
+
229
+ def generate_smooth_closed_camera_path(existing_cameras: List[Camera], N: int = 120, d: float = 2.0, s=.25) -> List[Camera]:
230
+ """
231
+ Generate a smooth, closed path interpolating the positions of existing cameras.
232
+
233
+ Parameters:
234
+ existing_cameras (List[Camera]): List of existing cameras.
235
+ N (int): Number of points (cameras) to sample along the smooth path.
236
+ d (float): Distance ahead for estimating the center of view.
237
+
238
+ Returns:
239
+ List[Camera]: A list of smoothly moving Camera objects along a closed loop.
240
+ """
241
+ # Step 1: Extract camera positions
242
+ positions = np.array([cam.T for cam in existing_cameras])
243
+
244
+ # Step 2: Estimate center of view
245
+ look_targets = [cam.T + cam.R[:, 2] * d for cam in existing_cameras]
246
+ center_of_view = np.mean(look_targets, axis=0)
247
+
248
+ # Step 3: Fit a smooth closed spline through the positions
249
+ positions = np.vstack([positions, positions[0]]) # close the loop
250
+ tck, u = splprep(positions.T, s=s, per=True) # periodic=True for closed loop
251
+
252
+ # Step 4: Sample points along the spline
253
+ u_fine = np.linspace(0, 1, N)
254
+ smooth_path = np.stack(splev(u_fine, tck), axis=-1)
255
+
256
+ # Step 5: Generate cameras along the smooth path
257
+ reference_cam = existing_cameras[0]
258
+ new_cameras = []
259
+
260
+ for i, pos in enumerate(smooth_path):
261
+ R = look_at_rotation(pos, center_of_view)
262
+ new_cameras.append(Camera(
263
+ R=R,
264
+ T=pos,
265
+ FoVx=reference_cam.FoVx,
266
+ FoVy=reference_cam.FoVy,
267
+ resolution=(reference_cam.image_width, reference_cam.image_height),
268
+ colmap_id=-1,
269
+ depth_params=None,
270
+ image=Image.fromarray(np.zeros((reference_cam.image_height, reference_cam.image_width, 3), dtype=np.uint8)),
271
+ invdepthmap=None,
272
+ image_name=f"smooth_path_i={i}",
273
+ uid=i
274
+ ))
275
+
276
+ return new_cameras
277
+
278
+
279
+ def save_numpy_frames_as_mp4(frames, output_path="animation.mp4", fps=10, center_crop: float = 1.0):
280
+ """
281
+ Save a list of RGB NumPy frames as an MP4 video with optional center cropping.
282
+
283
+ Parameters:
284
+ frames (List[np.ndarray]): List of RGB images as uint8 NumPy arrays (shape HxWx3).
285
+ output_path (str): Path to save the output MP4.
286
+ fps (int): Frames per second for playback speed.
287
+ center_crop (float): Fraction (0 < center_crop <= 1.0) of central region to retain.
288
+ Use 1.0 for no cropping; 0.8 to crop to 80% center region.
289
+
290
+ Returns:
291
+ None
292
+ """
293
+ with imageio.get_writer(output_path, fps=fps, codec='libx264', quality=8) as writer:
294
+ for frame in frames:
295
+ cropped = center_crop_frame(frame, center_crop)
296
+ writer.append_data(cropped)
297
+ print(f"MP4 saved to: {output_path}")
298
+
299
+
300
+
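Note (not part of the diff): a rough end-to-end sketch of how these helpers are meant to be combined into a fly-around video, assuming a trained Warper3DGS instance generator3DGS is available and the functions above are in scope.

    cameras = generator3DGS.scene.getTrainCameras()
    path = generate_circular_camera_path(cameras, N=120, radius_scale=1.0, d=2.0)
    frames = [render_gaussians_rgb(generator3DGS, cam) for cam in path]
    save_numpy_frames_as_mp4(frames, "flyaround.mp4", fps=30, center_crop=0.9)
    save_numpy_frames_as_gif(frames, "flyaround.gif", duration=50)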
301
+ def put_text_on_image(img: np.ndarray, text: str) -> np.ndarray:
302
+ """
303
+ Draws multiline white text on a copy of the input image, positioned near the bottom
304
+ and around 80% of the image width. Handles '\n' characters to split text into multiple lines.
305
+
306
+ Args:
307
+ img (np.ndarray): Input image as a (H, W, 3) uint8 numpy array.
308
+ text (str): Text string to draw on the image. Newlines '\n' are treated as line breaks.
309
+
310
+ Returns:
311
+ np.ndarray: The output image with the text drawn on it.
312
+
313
+ Notes:
314
+ - The function automatically adjusts line spacing and prevents text from going outside the image.
315
+ - Text is drawn in white with small font size (0.5) for minimal visual impact.
316
+ """
317
+ img = img.copy()
318
+ height, width, _ = img.shape
319
+
320
+ font = cv2.FONT_HERSHEY_SIMPLEX
321
+ font_scale = 1.
322
+ color = (255, 255, 255)
323
+ thickness = 2
324
+ line_spacing = 5 # extra pixels between lines
325
+
326
+ lines = text.split('\n')
327
+
328
+ # Precompute the maximum text width to adjust starting x
329
+ max_text_width = max(cv2.getTextSize(line, font, font_scale, thickness)[0][0] for line in lines)
330
+
331
+ x = int(0.8 * width)
332
+ x = min(x, width - max_text_width - 30) # margin on right
333
+ #x = int(0.03 * width)
334
+
335
+ # Start near the bottom, but move up depending on number of lines
336
+ total_text_height = len(lines) * (cv2.getTextSize('A', font, font_scale, thickness)[0][1] + line_spacing)
337
+ y_start = int(height*0.9) - total_text_height # 30 pixels from bottom
338
+
339
+ for i, line in enumerate(lines):
340
+ y = y_start + i * (cv2.getTextSize(line, font, font_scale, thickness)[0][1] + line_spacing)
341
+ cv2.putText(img, line, (x, y), font, font_scale, color, thickness, cv2.LINE_AA)
342
+
343
+ return img
344
+
345
+
346
+
347
+
348
+ def catmull_rom_spline(P0, P1, P2, P3, n_points=20):
349
+ """
350
+ Compute Catmull-Rom spline segment between P1 and P2.
351
+ """
352
+ t = np.linspace(0, 1, n_points)[:, None]
353
+
354
+ M = 0.5 * np.array([
355
+ [-1, 3, -3, 1],
356
+ [ 2, -5, 4, -1],
357
+ [-1, 0, 1, 0],
358
+ [ 0, 2, 0, 0]
359
+ ])
360
+
361
+ G = np.stack([P0, P1, P2, P3], axis=0)
362
+ T = np.concatenate([t**3, t**2, t, np.ones_like(t)], axis=1)
363
+
364
+ return T @ M @ G
365
+
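Note (not part of the diff): a Catmull-Rom segment interpolates its two middle control points (P1 at t=0, P2 at t=1), which is what makes the stitched path pass through the sampled camera positions; a small check, assuming catmull_rom_spline defined above is in scope.

    import numpy as np

    P0, P1, P2, P3 = (np.array(p, dtype=float) for p in ([0, 0, 0], [1, 0, 0], [1, 1, 0], [2, 1, 0]))
    seg = catmull_rom_spline(P0, P1, P2, P3, n_points=10)

    assert seg.shape == (10, 3)
    assert np.allclose(seg[0], P1) and np.allclose(seg[-1], P2)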
366
+ def sort_cameras_pca(existing_cameras: List[Camera]):
367
+ """
368
+ Sort cameras along the main PCA axis.
369
+ """
370
+ positions = np.array([cam.T for cam in existing_cameras])
371
+ pca = PCA(n_components=1)
372
+ scores = pca.fit_transform(positions)
373
+ sorted_indices = np.argsort(scores[:, 0])
374
+ return sorted_indices
375
+
376
+ def generate_fully_smooth_cameras(existing_cameras: List[Camera],
377
+ n_selected: int = 30,
378
+ n_points_per_segment: int = 20,
379
+ d: float = 2.0,
380
+ closed: bool = False) -> List[Camera]:
381
+ """
382
+ Generate a fully smooth camera path using PCA ordering, global Catmull-Rom spline for positions, and global SLERP for orientations.
383
+
384
+ Args:
385
+ existing_cameras (List[Camera]): List of input cameras.
386
+ n_selected (int): Number of cameras to select after sorting.
387
+ n_points_per_segment (int): Number of interpolated points per spline segment.
388
+ d (float): Distance ahead for estimating center of view.
389
+ closed (bool): Whether to close the path.
390
+
391
+ Returns:
392
+ List[Camera]: List of smoothly moving Camera objects.
393
+ """
394
+ # 1. Sort cameras along PCA axis
395
+ sorted_indices = sort_cameras_pca(existing_cameras)
396
+ sorted_cameras = [existing_cameras[i] for i in sorted_indices]
397
+ positions = np.array([cam.T for cam in sorted_cameras])
398
+
399
+ # 2. Subsample uniformly
400
+ idx = np.linspace(0, len(positions) - 1, n_selected).astype(int)
401
+ sampled_positions = positions[idx]
402
+ sampled_cameras = [sorted_cameras[i] for i in idx]
403
+
404
+ # 3. Prepare for Catmull-Rom
405
+ if closed:
406
+ sampled_positions = np.vstack([sampled_positions[-1], sampled_positions, sampled_positions[0], sampled_positions[1]])
407
+ else:
408
+ sampled_positions = np.vstack([sampled_positions[0], sampled_positions, sampled_positions[-1], sampled_positions[-1]])
409
+
410
+ # 4. Generate smooth path positions
411
+ path_positions = []
412
+ for i in range(1, len(sampled_positions) - 2):
413
+ segment = catmull_rom_spline(sampled_positions[i-1], sampled_positions[i], sampled_positions[i+1], sampled_positions[i+2], n_points_per_segment)
414
+ path_positions.append(segment)
415
+ path_positions = np.concatenate(path_positions, axis=0)
416
+
417
+ # 5. Global SLERP for rotations
418
+ rotations = R.from_matrix([cam.R for cam in sampled_cameras])
419
+ key_times = np.linspace(0, 1, len(rotations))
420
+ slerp = Slerp(key_times, rotations)
421
+
422
+ query_times = np.linspace(0, 1, len(path_positions))
423
+ interpolated_rotations = slerp(query_times)
424
+
425
+ # 6. Generate Camera objects
426
+ reference_cam = existing_cameras[0]
427
+ smooth_cameras = []
428
+
429
+ for i, pos in enumerate(path_positions):
430
+ R_interp = interpolated_rotations[i].as_matrix()
431
+
432
+ smooth_cameras.append(Camera(
433
+ R=R_interp,
434
+ T=pos,
435
+ FoVx=reference_cam.FoVx,
436
+ FoVy=reference_cam.FoVy,
437
+ resolution=(reference_cam.image_width, reference_cam.image_height),
438
+ colmap_id=-1,
439
+ depth_params=None,
440
+ image=Image.fromarray(np.zeros((reference_cam.image_height, reference_cam.image_width, 3), dtype=np.uint8)),
441
+ invdepthmap=None,
442
+ image_name=f"fully_smooth_path_i={i}",
443
+ uid=i
444
+ ))
445
+
446
+ return smooth_cameras
447
+
448
+
449
+ def plot_cameras_and_smooth_path_with_orientation(existing_cameras: List[Camera], smooth_cameras: List[Camera], scale: float = 0.1):
450
+ """
451
+ Plot input cameras and smooth path cameras with their orientations in 3D.
452
+
453
+ Args:
454
+ existing_cameras (List[Camera]): List of original input cameras.
455
+ smooth_cameras (List[Camera]): List of smooth path cameras.
456
+ scale (float): Length of orientation arrows.
457
+
458
+ Returns:
459
+ None
460
+ """
461
+ # Input cameras
462
+ input_positions = np.array([cam.T for cam in existing_cameras])
463
+
464
+ # Smooth cameras
465
+ smooth_positions = np.array([cam.T for cam in smooth_cameras])
466
+
467
+ fig = go.Figure()
468
+
469
+ # Plot input camera positions
470
+ fig.add_trace(go.Scatter3d(
471
+ x=input_positions[:, 0], y=input_positions[:, 1], z=input_positions[:, 2],
472
+ mode='markers',
473
+ marker=dict(size=4, color='blue'),
474
+ name='Input Cameras'
475
+ ))
476
+
477
+ # Plot smooth path positions
478
+ fig.add_trace(go.Scatter3d(
479
+ x=smooth_positions[:, 0], y=smooth_positions[:, 1], z=smooth_positions[:, 2],
480
+ mode='lines+markers',
481
+ line=dict(color='red', width=3),
482
+ marker=dict(size=2, color='red'),
483
+ name='Smooth Path Cameras'
484
+ ))
485
+
486
+ # Plot input camera orientations
487
+ for cam in existing_cameras:
488
+ origin = cam.T
489
+ forward = cam.R[:, 2] # Forward direction
490
+
491
+ fig.add_trace(go.Cone(
492
+ x=[origin[0]], y=[origin[1]], z=[origin[2]],
493
+ u=[forward[0]], v=[forward[1]], w=[forward[2]],
494
+ colorscale=[[0, 'blue'], [1, 'blue']],
495
+ sizemode="absolute",
496
+ sizeref=scale,
497
+ anchor="tail",
498
+ showscale=False,
499
+ name='Input Camera Direction'
500
+ ))
501
+
502
+ # Plot smooth camera orientations
503
+ for cam in smooth_cameras:
504
+ origin = cam.T
505
+ forward = cam.R[:, 2] # Forward direction
506
+
507
+ fig.add_trace(go.Cone(
508
+ x=[origin[0]], y=[origin[1]], z=[origin[2]],
509
+ u=[forward[0]], v=[forward[1]], w=[forward[2]],
510
+ colorscale=[[0, 'red'], [1, 'red']],
511
+ sizemode="absolute",
512
+ sizeref=scale,
513
+ anchor="tail",
514
+ showscale=False,
515
+ name='Smooth Camera Direction'
516
+ ))
517
+
518
+ fig.update_layout(
519
+ scene=dict(
520
+ xaxis_title='X',
521
+ yaxis_title='Y',
522
+ zaxis_title='Z',
523
+ aspectmode='data'
524
+ ),
525
+ title="Input Cameras and Smooth Path with Orientations",
526
+ margin=dict(l=0, r=0, b=0, t=30)
527
+ )
528
+
529
+ fig.show()
530
+
531
+
532
+ def solve_tsp_nearest_neighbor(points: np.ndarray):
533
+ """
534
+ Solve TSP approximately using nearest neighbor heuristic.
535
+
536
+ Args:
537
+ points (np.ndarray): (N, 3) array of points.
538
+
539
+ Returns:
540
+ List[int]: Optimal visiting order of points.
541
+ """
542
+ N = points.shape[0]
543
+ dist = distance_matrix(points, points)
544
+ visited = [0]
545
+ unvisited = set(range(1, N))
546
+
547
+ while unvisited:
548
+ last = visited[-1]
549
+ next_city = min(unvisited, key=lambda city: dist[last, city])
550
+ visited.append(next_city)
551
+ unvisited.remove(next_city)
552
+
553
+ return visited
554
+
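Note (not part of the diff): the nearest-neighbour heuristic returns a visiting order, i.e. a permutation of the input indices starting at point 0; a quick sketch, assuming solve_tsp_nearest_neighbor defined above is in scope.

    import numpy as np

    pts = np.random.default_rng(0).random((20, 3))
    order = solve_tsp_nearest_neighbor(pts)

    assert sorted(order) == list(range(len(pts)))  # every point visited exactly once
    tour_len = sum(np.linalg.norm(pts[a] - pts[b]) for a, b in zip(order[:-1], order[1:]))
    print(order, round(tour_len, 3))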
555
+ def solve_tsp_2opt(points: np.ndarray, n_iter: int = 1000) -> np.ndarray:
556
+ """
557
+ Solve TSP approximately using Nearest Neighbor + 2-Opt.
558
+
559
+ Args:
560
+ points (np.ndarray): Array of shape (N, D) with points.
561
+ n_iter (int): Number of 2-opt iterations.
562
+
563
+ Returns:
564
+ np.ndarray: Ordered list of indices.
565
+ """
566
+ n_points = points.shape[0]
567
+
568
+ # === 1. Start with Nearest Neighbor
569
+ unvisited = list(range(n_points))
570
+ current = unvisited.pop(0)
571
+ path = [current]
572
+
573
+ while unvisited:
574
+ dists = np.linalg.norm(points[unvisited] - points[current], axis=1)
575
+ next_idx = unvisited[np.argmin(dists)]
576
+ unvisited.remove(next_idx)
577
+ path.append(next_idx)
578
+ current = next_idx
579
+
580
+ # === 2. Apply 2-Opt improvements
581
+ def path_length(path):
582
+ return sum(np.linalg.norm(points[path[i]] - points[path[i+1]]) for i in range(len(path)-1))
583
+
584
+ best_length = path_length(path)
585
+ improved = True
586
+
587
+ for _ in range(n_iter):
588
+ if not improved:
589
+ break
590
+ improved = False
591
+ for i in range(1, n_points - 2):
592
+ for j in range(i + 1, n_points):
593
+ if j - i == 1: continue
594
+ new_path = path[:i] + path[i:j][::-1] + path[j:]
595
+ new_length = path_length(new_path)
596
+ if new_length < best_length:
597
+ path = new_path
598
+ best_length = new_length
599
+ improved = True
600
+ break
601
+ if improved:
602
+ break
603
+
604
+ return np.array(path)
605
+
606
+ def generate_fully_smooth_cameras_with_tsp(existing_cameras: List[Camera],
607
+ n_selected: int = 30,
608
+ n_points_per_segment: int = 20,
609
+ d: float = 2.0,
610
+ closed: bool = False) -> List[Camera]:
611
+ """
612
+ Generate a fully smooth camera path using TSP ordering, global Catmull-Rom spline for positions, and global SLERP for orientations.
613
+
614
+ Args:
615
+ existing_cameras (List[Camera]): List of input cameras.
616
+ n_selected (int): Number of cameras to select after ordering.
617
+ n_points_per_segment (int): Number of interpolated points per spline segment.
618
+ d (float): Distance ahead for estimating center of view.
619
+ closed (bool): Whether to close the path.
620
+
621
+ Returns:
622
+ List[Camera]: List of smoothly moving Camera objects.
623
+ """
624
+ positions = np.array([cam.T for cam in existing_cameras])
625
+
626
+ # 1. Solve approximate TSP
627
+ order = solve_tsp_nearest_neighbor(positions)
628
+ ordered_cameras = [existing_cameras[i] for i in order]
629
+ ordered_positions = positions[order]
630
+
631
+ # 2. Subsample uniformly
632
+ idx = np.linspace(0, len(ordered_positions) - 1, n_selected).astype(int)
633
+ sampled_positions = ordered_positions[idx]
634
+ sampled_cameras = [ordered_cameras[i] for i in idx]
635
+
636
+ # 3. Prepare for Catmull-Rom
637
+ if closed:
638
+ sampled_positions = np.vstack([sampled_positions[-1], sampled_positions, sampled_positions[0], sampled_positions[1]])
639
+ else:
640
+ sampled_positions = np.vstack([sampled_positions[0], sampled_positions, sampled_positions[-1], sampled_positions[-1]])
641
+
642
+ # 4. Generate smooth path positions
643
+ path_positions = []
644
+ for i in range(1, len(sampled_positions) - 2):
645
+ segment = catmull_rom_spline(sampled_positions[i-1], sampled_positions[i], sampled_positions[i+1], sampled_positions[i+2], n_points_per_segment)
646
+ path_positions.append(segment)
647
+ path_positions = np.concatenate(path_positions, axis=0)
648
+
649
+ # 5. Global SLERP for rotations
650
+ rotations = R.from_matrix([cam.R for cam in sampled_cameras])
651
+ key_times = np.linspace(0, 1, len(rotations))
652
+ slerp = Slerp(key_times, rotations)
653
+
654
+ query_times = np.linspace(0, 1, len(path_positions))
655
+ interpolated_rotations = slerp(query_times)
656
+
657
+ # 6. Generate Camera objects
658
+ reference_cam = existing_cameras[0]
659
+ smooth_cameras = []
660
+
661
+ for i, pos in enumerate(path_positions):
662
+ R_interp = interpolated_rotations[i].as_matrix()
663
+
664
+ smooth_cameras.append(Camera(
665
+ R=R_interp,
666
+ T=pos,
667
+ FoVx=reference_cam.FoVx,
668
+ FoVy=reference_cam.FoVy,
669
+ resolution=(reference_cam.image_width, reference_cam.image_height),
670
+ colmap_id=-1,
671
+ depth_params=None,
672
+ image=Image.fromarray(np.zeros((reference_cam.image_height, reference_cam.image_width, 3), dtype=np.uint8)),
673
+ invdepthmap=None,
674
+ image_name=f"fully_smooth_path_i={i}",
675
+ uid=i
676
+ ))
677
+
678
+ return smooth_cameras
679
+
680
+ from typing import List
681
+ import numpy as np
682
+ from sklearn.mixture import GaussianMixture
683
+ from scipy.spatial.transform import Rotation as R, Slerp
684
+ from PIL import Image
685
+
686
+ def generate_clustered_smooth_cameras_with_tsp(existing_cameras: List[Camera],
687
+ n_selected: int = 30,
688
+ n_points_per_segment: int = 20,
689
+ d: float = 2.0,
690
+ n_clusters: int = 5,
691
+ closed: bool = False) -> List[Camera]:
692
+ """
693
+ Generate a fully smooth camera path using clustering + TSP between nearest cluster centers + TSP inside clusters.
694
+ Positions are normalized before clustering and denormalized before generating final cameras.
695
+
696
+ Args:
697
+ existing_cameras (List[Camera]): List of input cameras.
698
+ n_selected (int): Number of cameras to select after ordering.
699
+ n_points_per_segment (int): Number of interpolated points per spline segment.
700
+ d (float): Distance ahead for estimating center of view.
701
+ n_clusters (int): Number of GMM clusters.
702
+ closed (bool): Whether to close the path.
703
+
704
+ Returns:
705
+ List[Camera]: Smooth path of Camera objects.
706
+ """
707
+ # Extract positions and rotations
708
+ positions = np.array([cam.T for cam in existing_cameras])
709
+ rotations = np.array([R.from_matrix(cam.R).as_quat() for cam in existing_cameras])
710
+
711
+ # === Normalize positions
712
+ mean_pos = np.mean(positions, axis=0)
713
+ scale_pos = np.std(positions, axis=0)
714
+ scale_pos[scale_pos == 0] = 1.0 # avoid division by zero
715
+
716
+ positions_normalized = (positions - mean_pos) / scale_pos
717
+
718
+ # === Features for clustering (only positions, not rotations)
719
+ features = positions_normalized
720
+
721
+ # === 1. GMM clustering
722
+ gmm = GaussianMixture(n_components=n_clusters, covariance_type='full', random_state=42)
723
+ cluster_labels = gmm.fit_predict(features)
724
+
725
+ clusters = {}
726
+ cluster_centers = []
727
+
728
+ for cluster_id in range(n_clusters):
729
+ cluster_indices = np.where(cluster_labels == cluster_id)[0]
730
+ if len(cluster_indices) == 0:
731
+ continue
732
+ clusters[cluster_id] = cluster_indices
733
+ cluster_center = np.mean(features[cluster_indices], axis=0)
734
+ cluster_centers.append(cluster_center)
735
+
736
+ cluster_centers = np.stack(cluster_centers)
737
+
738
+ # === 2. Remap cluster centers to nearest existing cameras
739
+ if False:
740
+ mapped_centers = []
741
+ for center in cluster_centers:
742
+ dists = np.linalg.norm(features - center, axis=1)
743
+ nearest_idx = np.argmin(dists)
744
+ mapped_centers.append(features[nearest_idx])
745
+ mapped_centers = np.stack(mapped_centers)
746
+ cluster_centers = mapped_centers
747
+ # === 3. Solve TSP between mapped cluster centers
748
+ cluster_order = solve_tsp_2opt(cluster_centers)
749
+
750
+ # === 4. For each cluster, solve TSP inside cluster
751
+ final_indices = []
752
+ for cluster_id in cluster_order:
753
+ cluster_indices = clusters[cluster_id]
754
+ cluster_positions = features[cluster_indices]
755
+
756
+ if len(cluster_positions) == 1:
757
+ final_indices.append(cluster_indices[0])
758
+ continue
759
+
760
+ local_order = solve_tsp_nearest_neighbor(cluster_positions)
761
+ ordered_cluster_indices = cluster_indices[local_order]
762
+ final_indices.extend(ordered_cluster_indices)
763
+
764
+ ordered_cameras = [existing_cameras[i] for i in final_indices]
765
+ ordered_positions = positions_normalized[final_indices]
766
+
767
+ # === 5. Subsample uniformly
768
+ idx = np.linspace(0, len(ordered_positions) - 1, n_selected).astype(int)
769
+ sampled_positions = ordered_positions[idx]
770
+ sampled_cameras = [ordered_cameras[i] for i in idx]
771
+
772
+ # === 6. Prepare for Catmull-Rom spline
773
+ if closed:
774
+ sampled_positions = np.vstack([sampled_positions[-1], sampled_positions, sampled_positions[0], sampled_positions[1]])
775
+ else:
776
+ sampled_positions = np.vstack([sampled_positions[0], sampled_positions, sampled_positions[-1], sampled_positions[-1]])
777
+
778
+ # === 7. Smooth path positions
779
+ path_positions = []
780
+ for i in range(1, len(sampled_positions) - 2):
781
+ segment = catmull_rom_spline(sampled_positions[i-1], sampled_positions[i], sampled_positions[i+1], sampled_positions[i+2], n_points_per_segment)
782
+ path_positions.append(segment)
783
+ path_positions = np.concatenate(path_positions, axis=0)
784
+
785
+ # === 8. Denormalize
786
+ path_positions = path_positions * scale_pos + mean_pos
787
+
788
+ # === 9. SLERP for rotations
789
+ rotations = R.from_matrix([cam.R for cam in sampled_cameras])
790
+ key_times = np.linspace(0, 1, len(rotations))
791
+ slerp = Slerp(key_times, rotations)
792
+
793
+ query_times = np.linspace(0, 1, len(path_positions))
794
+ interpolated_rotations = slerp(query_times)
795
+
796
+ # === 10. Generate Camera objects
797
+ reference_cam = existing_cameras[0]
798
+ smooth_cameras = []
799
+
800
+ for i, pos in enumerate(path_positions):
801
+ R_interp = interpolated_rotations[i].as_matrix()
802
+
803
+ smooth_cameras.append(Camera(
804
+ R=R_interp,
805
+ T=pos,
806
+ FoVx=reference_cam.FoVx,
807
+ FoVy=reference_cam.FoVy,
808
+ resolution=(reference_cam.image_width, reference_cam.image_height),
809
+ colmap_id=-1,
810
+ depth_params=None,
811
+ image=Image.fromarray(np.zeros((reference_cam.image_height, reference_cam.image_width, 3), dtype=np.uint8)),
812
+ invdepthmap=None,
813
+ image_name=f"clustered_smooth_path_i={i}",
814
+ uid=i
815
+ ))
816
+
817
+ return smooth_cameras
818
+
819
+
820
+ # def generate_clustered_path(existing_cameras: List[Camera],
821
+ # n_points_per_segment: int = 20,
822
+ # d: float = 2.0,
823
+ # n_clusters: int = 5,
824
+ # closed: bool = False) -> List[Camera]:
825
+ # """
826
+ # Generate a smooth camera path using GMM clustering and TSP on cluster centers.
827
+
828
+ # Args:
829
+ # existing_cameras (List[Camera]): List of input cameras.
830
+ # n_points_per_segment (int): Number of interpolated points per spline segment.
831
+ # d (float): Distance ahead for estimating center of view.
832
+ # n_clusters (int): Number of GMM clusters (zones).
833
+ # closed (bool): Whether to close the path.
834
+
835
+ # Returns:
836
+ # List[Camera]: Smooth path of Camera objects.
837
+ # """
838
+ # # Extract positions and rotations
839
+ # positions = np.array([cam.T for cam in existing_cameras])
840
+
841
+ # # === Normalize positions
842
+ # mean_pos = np.mean(positions, axis=0)
843
+ # scale_pos = np.std(positions, axis=0)
844
+ # scale_pos[scale_pos == 0] = 1.0
845
+
846
+ # positions_normalized = (positions - mean_pos) / scale_pos
847
+
848
+ # # === 1. GMM clustering (only positions)
849
+ # gmm = GaussianMixture(n_components=n_clusters, covariance_type='full', random_state=42)
850
+ # cluster_labels = gmm.fit_predict(positions_normalized)
851
+
852
+ # cluster_centers = []
853
+ # for cluster_id in range(n_clusters):
854
+ # cluster_indices = np.where(cluster_labels == cluster_id)[0]
855
+ # if len(cluster_indices) == 0:
856
+ # continue
857
+ # cluster_center = np.mean(positions_normalized[cluster_indices], axis=0)
858
+ # cluster_centers.append(cluster_center)
859
+
860
+ # cluster_centers = np.stack(cluster_centers)
861
+
862
+ # # === 2. Solve TSP between cluster centers
863
+ # cluster_order = solve_tsp_2opt(cluster_centers)
864
+
865
+ # # === 3. Reorder cluster centers
866
+ # ordered_centers = cluster_centers[cluster_order]
867
+
868
+ # # === 4. Prepare Catmull-Rom spline
869
+ # if closed:
870
+ # ordered_centers = np.vstack([ordered_centers[-1], ordered_centers, ordered_centers[0], ordered_centers[1]])
871
+ # else:
872
+ # ordered_centers = np.vstack([ordered_centers[0], ordered_centers, ordered_centers[-1], ordered_centers[-1]])
873
+
874
+ # # === 5. Generate smooth path positions
875
+ # path_positions = []
876
+ # for i in range(1, len(ordered_centers) - 2):
877
+ # segment = catmull_rom_spline(ordered_centers[i-1], ordered_centers[i], ordered_centers[i+1], ordered_centers[i+2], n_points_per_segment)
878
+ # path_positions.append(segment)
879
+ # path_positions = np.concatenate(path_positions, axis=0)
880
+
881
+ # # === 6. Denormalize back
882
+ # path_positions = path_positions * scale_pos + mean_pos
883
+
884
+ # # === 7. Generate dummy rotations (constant forward facing)
885
+ # reference_cam = existing_cameras[0]
886
+ # default_rotation = R.from_matrix(reference_cam.R)
887
+
888
+ # # For simplicity, fixed rotation for all
889
+ # smooth_cameras = []
890
+
891
+ # for i, pos in enumerate(path_positions):
892
+ # R_interp = default_rotation.as_matrix()
893
+
894
+ # smooth_cameras.append(Camera(
895
+ # R=R_interp,
896
+ # T=pos,
897
+ # FoVx=reference_cam.FoVx,
898
+ # FoVy=reference_cam.FoVy,
899
+ # resolution=(reference_cam.image_width, reference_cam.image_height),
900
+ # colmap_id=-1,
901
+ # depth_params=None,
902
+ # image=Image.fromarray(np.zeros((reference_cam.image_height, reference_cam.image_width, 3), dtype=np.uint8)),
903
+ # invdepthmap=None,
904
+ # image_name=f"cluster_path_i={i}",
905
+ # uid=i
906
+ # ))
907
+
908
+ # return smooth_cameras
909
+
910
+ from typing import List
911
+ import numpy as np
912
+ from sklearn.cluster import KMeans
913
+ from scipy.spatial.transform import Rotation as R, Slerp
914
+ from PIL import Image
915
+
916
+ def generate_clustered_path(existing_cameras: List[Camera],
917
+ n_points_per_segment: int = 20,
918
+ d: float = 2.0,
919
+ n_clusters: int = 5,
920
+ closed: bool = False) -> List[Camera]:
921
+ """
922
+ Generate a smooth camera path using K-Means clustering and TSP on cluster centers.
923
+
924
+ Args:
925
+ existing_cameras (List[Camera]): List of input cameras.
926
+ n_points_per_segment (int): Number of interpolated points per spline segment.
927
+ d (float): Distance ahead for estimating the center of view (currently unused in this function).
928
+ n_clusters (int): Number of KMeans clusters (zones).
929
+ closed (bool): Whether to close the path.
930
+
931
+ Returns:
932
+ List[Camera]: Smooth path of Camera objects.
933
+ """
934
+ # Extract positions
935
+ positions = np.array([cam.T for cam in existing_cameras])
936
+
937
+ # === Normalize positions
938
+ mean_pos = np.mean(positions, axis=0)
939
+ scale_pos = np.std(positions, axis=0)
940
+ scale_pos[scale_pos == 0] = 1.0
941
+
942
+ positions_normalized = (positions - mean_pos) / scale_pos
943
+
944
+ # === 1. K-Means clustering (only positions)
945
+ kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
946
+ cluster_labels = kmeans.fit_predict(positions_normalized)
947
+
948
+ cluster_centers = []
949
+ for cluster_id in range(n_clusters):
950
+ cluster_indices = np.where(cluster_labels == cluster_id)[0]
951
+ if len(cluster_indices) == 0:
952
+ continue
953
+ cluster_center = np.mean(positions_normalized[cluster_indices], axis=0)
954
+ cluster_centers.append(cluster_center)
955
+
956
+ cluster_centers = np.stack(cluster_centers)
957
+
958
+ # === 2. Solve TSP between cluster centers
959
+ cluster_order = solve_tsp_2opt(cluster_centers)
960
+
961
+ # === 3. Reorder cluster centers
962
+ ordered_centers = cluster_centers[cluster_order]
963
+
964
+ # === 4. Prepare Catmull-Rom spline
965
+ if closed:
966
+ ordered_centers = np.vstack([ordered_centers[-1], ordered_centers, ordered_centers[0], ordered_centers[1]])
967
+ else:
968
+ ordered_centers = np.vstack([ordered_centers[0], ordered_centers, ordered_centers[-1], ordered_centers[-1]])
969
+
970
+ # === 5. Generate smooth path positions
971
+ path_positions = []
972
+ for i in range(1, len(ordered_centers) - 2):
973
+ segment = catmull_rom_spline(ordered_centers[i-1], ordered_centers[i], ordered_centers[i+1], ordered_centers[i+2], n_points_per_segment)
974
+ path_positions.append(segment)
975
+ path_positions = np.concatenate(path_positions, axis=0)
976
+
977
+ # === 6. Denormalize back
978
+ path_positions = path_positions * scale_pos + mean_pos
979
+
980
+ # === 7. Generate dummy rotations (constant forward facing)
981
+ reference_cam = existing_cameras[0]
982
+ default_rotation = R.from_matrix(reference_cam.R)
983
+
984
+ # For simplicity, fixed rotation for all
985
+ smooth_cameras = []
986
+
987
+ for i, pos in enumerate(path_positions):
988
+ R_interp = default_rotation.as_matrix()
989
+
990
+ smooth_cameras.append(Camera(
991
+ R=R_interp,
992
+ T=pos,
993
+ FoVx=reference_cam.FoVx,
994
+ FoVy=reference_cam.FoVy,
995
+ resolution=(reference_cam.image_width, reference_cam.image_height),
996
+ colmap_id=-1,
997
+ depth_params=None,
998
+ image=Image.fromarray(np.zeros((reference_cam.image_height, reference_cam.image_width, 3), dtype=np.uint8)),
999
+ invdepthmap=None,
1000
+ image_name=f"cluster_path_i={i}",
1001
+ uid=i
1002
+ ))
1003
+
1004
+ return smooth_cameras
1005
+
1006
+
1007
+
1008
+
1009
+ def visualize_image_with_points(image, points):
1010
+ """
1011
+ Visualize an image with points overlaid on top. This is useful for correspondence visualizations.
1012
+
1013
+ Parameters:
1014
+ - image: PIL Image object
1015
+ - points: Numpy array of shape [N, 2] containing (x, y) coordinates of points
1016
+
1017
+ Returns:
1018
+ - None (displays the visualization)
1019
+ """
1020
+
1021
+ # Convert PIL image to numpy array
1022
+ img_array = np.array(image)
1023
+
1024
+ # Create a figure and axis
1025
+ fig, ax = plt.subplots(figsize=(7,7))
1026
+
1027
+ # Display the image
1028
+ ax.imshow(img_array)
1029
+
1030
+ # Scatter plot the points on top of the image
1031
+ ax.scatter(points[:, 0], points[:, 1], color='red', marker='o', s=1)
1032
+
1033
+ # Show the plot
1034
+ plt.show()
1035
+
1036
+
1037
+ def visualize_correspondences(image1, points1, image2, points2):
1038
+ """
1039
+ Visualize two images concatenated horizontally with key points and correspondences.
1040
+
1041
+ Parameters:
1042
+ - image1: PIL Image object (left image)
1043
+ - points1: Numpy array of shape [N, 2] containing (x, y) coordinates of key points for image1
1044
+ - image2: PIL Image object (right image)
1045
+ - points2: Numpy array of shape [N, 2] containing (x, y) coordinates of key points for image2
1046
+
1047
+ Returns:
1048
+ - None (displays the visualization)
1049
+ """
1050
+
1051
+ # Concatenate images horizontally
1052
+ concatenated_image = np.concatenate((np.array(image1), np.array(image2)), axis=1)
1053
+
1054
+ # Create a figure and axis
1055
+ fig, ax = plt.subplots(figsize=(10,10))
1056
+
1057
+ # Display the concatenated image
1058
+ ax.imshow(concatenated_image)
1059
+
1060
+ # Plot key points on the left image
1061
+ ax.scatter(points1[:, 0], points1[:, 1], color='red', marker='o', s=10)
1062
+
1063
+ # Plot key points on the right image
1064
+ ax.scatter(points2[:, 0] + image1.width, points2[:, 1], color='blue', marker='o', s=10)
1065
+
1066
+ # Draw lines connecting corresponding key points
1067
+ for i in range(len(points1)):
1068
+ ax.plot([points1[i, 0], points2[i, 0] + image1.width], [points1[i, 1], points2[i, 1]])#, color='green')
1069
+
1070
+ # Show the plot
1071
+ plt.show()
1072
+
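The camera-path helpers added above are easiest to follow end to end with a small driver. The sketch below is illustrative only: it assumes `source.visualization` exposes the functions and the `Camera` class shown in this diff, and `train_cameras` is a hypothetical list of `Camera` objects loaded elsewhere.

```python
# Minimal usage sketch for the camera-path helpers above.
# Assumptions: source.visualization exposes the functions shown in this diff,
# and `train_cameras` is a list of Camera objects you already have (hypothetical).
from source.visualization import (
    generate_clustered_smooth_cameras_with_tsp,
    generate_clustered_path,
)

def build_flythrough(train_cameras):
    # Cluster-aware ordering, Catmull-Rom positions, SLERP rotations.
    smooth_path = generate_clustered_smooth_cameras_with_tsp(
        train_cameras,
        n_selected=30,            # key cameras kept after TSP ordering
        n_points_per_segment=20,  # spline density between key cameras
        n_clusters=5,
        closed=False,
    )
    # Coarser alternative: one spline through the K-Means cluster centers only,
    # with a fixed rotation taken from the first camera.
    coarse_path = generate_clustered_path(train_cameras, n_clusters=5)
    return smooth_path, coarse_path
```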
submodules/RoMa/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ *.egg-info*
2
+ *.vscode*
3
+ *__pycache__*
4
+ vis*
5
+ workspace*
6
+ .venv
7
+ .DS_Store
8
+ jobs/*
9
+ *ignore_me*
10
+ *.pth
11
+ wandb*
submodules/RoMa/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Johan Edstedt
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
submodules/RoMa/README.md ADDED
@@ -0,0 +1,123 @@
1
+ #
2
+ <p align="center">
3
+ <h1 align="center"> <ins>RoMa</ins> 🏛️:<br> Robust Dense Feature Matching <br> ⭐CVPR 2024⭐</h1>
4
+ <p align="center">
5
+ <a href="https://scholar.google.com/citations?user=Ul-vMR0AAAAJ">Johan Edstedt</a>
6
+ ·
7
+ <a href="https://scholar.google.com/citations?user=HS2WuHkAAAAJ">Qiyu Sun</a>
8
+ ·
9
+ <a href="https://scholar.google.com/citations?user=FUE3Wd0AAAAJ">Georg Bökman</a>
10
+ ·
11
+ <a href="https://scholar.google.com/citations?user=6WRQpCQAAAAJ">Mårten Wadenbäck</a>
12
+ ·
13
+ <a href="https://scholar.google.com/citations?user=lkWfR08AAAAJ">Michael Felsberg</a>
14
+ </p>
15
+ <h2 align="center"><p>
16
+ <a href="https://arxiv.org/abs/2305.15404" align="center">Paper</a> |
17
+ <a href="https://parskatt.github.io/RoMa" align="center">Project Page</a>
18
+ </p></h2>
19
+ <div align="center"></div>
20
+ </p>
21
+ <br/>
22
+ <p align="center">
23
+ <img src="https://github.com/Parskatt/RoMa/assets/22053118/15d8fea7-aa6d-479f-8a93-350d950d006b" alt="example" width=80%>
24
+ <br>
25
+ <em>RoMa is the robust dense feature matcher capable of estimating pixel-dense warps and reliable certainties for almost any image pair.</em>
26
+ </p>
27
+
28
+ ## Setup/Install
29
+ In your python environment (tested on Linux python 3.10), run:
30
+ ```bash
31
+ pip install -e .
32
+ ```
33
+ ## Demo / How to Use
34
+ We provide several demos in the [demos folder](demo).
35
+ Here's the gist of it:
36
+ ```python
37
+ from romatch import roma_outdoor
38
+ roma_model = roma_outdoor(device=device)
39
+ # Match
40
+ warp, certainty = roma_model.match(imA_path, imB_path, device=device)
41
+ # Sample matches for estimation
42
+ matches, certainty = roma_model.sample(warp, certainty)
43
+ # Convert to pixel coordinates (RoMa produces matches in [-1,1]x[-1,1])
44
+ kptsA, kptsB = roma_model.to_pixel_coordinates(matches, H_A, W_A, H_B, W_B)
45
+ # Find a fundamental matrix (or anything else of interest)
46
+ F, mask = cv2.findFundamentalMat(
47
+ kptsA.cpu().numpy(), kptsB.cpu().numpy(), ransacReprojThreshold=0.2, method=cv2.USAC_MAGSAC, confidence=0.999999, maxIters=10000
48
+ )
49
+ ```
50
+
51
+ **New**: You can also match arbitrary keypoints with RoMa. See [match_keypoints](romatch/models/matcher.py) in RegressionMatcher.
52
+
53
+ ## Settings
54
+
55
+ ### Resolution
56
+ By default RoMa uses an initial resolution of (560,560) which is then upsampled to (864,864).
57
+ You can change this at construction (see roma_outdoor kwargs).
58
+ You can also change this later by setting roma_model.w_resized, roma_model.h_resized, and roma_model.upsample_res.
59
+
60
+ ### Sampling
61
+ roma_model.sample_thresh controls the thresholding used when sampling matches for estimation. In certain cases a lower or higher threshold may improve results.
62
+
63
+
64
+ ## Reproducing Results
65
+ The experiments in the paper are provided in the [experiments folder](experiments).
66
+
67
+ ### Training
68
+ 1. First follow the instructions provided here: https://github.com/Parskatt/DKM for downloading and preprocessing datasets.
69
+ 2. Run the relevant experiment, e.g.,
70
+ ```bash
71
+ torchrun --nproc_per_node=4 --nnodes=1 --rdzv_backend=c10d experiments/roma_outdoor.py
72
+ ```
73
+ ### Testing
74
+ ```bash
75
+ python experiments/roma_outdoor.py --only_test --benchmark mega-1500
76
+ ```
77
+ ## License
78
+ All our code except DINOv2 is MIT license.
79
+ DINOv2 has an Apache 2 license [DINOv2](https://github.com/facebookresearch/dinov2/blob/main/LICENSE).
80
+
81
+ ## Acknowledgement
82
+ Our codebase builds on the code in [DKM](https://github.com/Parskatt/DKM).
83
+
84
+ ## Tiny RoMa
85
+ If you find that RoMa is too heavy, you might want to try Tiny RoMa which is built on top of XFeat.
86
+ ```python
87
+ from romatch import tiny_roma_v1_outdoor
88
+ tiny_roma_model = tiny_roma_v1_outdoor(device=device)
89
+ ```
90
+ Mega1500:
91
+ | | AUC@5 | AUC@10 | AUC@20 |
92
+ |----------|----------|----------|----------|
93
+ | XFeat | 46.4 | 58.9 | 69.2 |
94
+ | XFeat* | 51.9 | 67.2 | 78.9 |
95
+ | Tiny RoMa v1 | 56.4 | 69.5 | 79.5 |
96
+ | RoMa | - | - | - |
97
+
98
+ Mega-8-Scenes (See DKM):
99
+ | | AUC@5 | AUC@10 | AUC@20 |
100
+ |----------|----------|----------|----------|
101
+ | XFeat | - | - | - |
102
+ | XFeat* | 50.1 | 64.4 | 75.2 |
103
+ | Tiny RoMa v1 | 57.7 | 70.5 | 79.6 |
104
+ | RoMa | - | - | - |
105
+
106
+ IMC22 :'):
107
+ | | mAA@10 |
108
+ |----------|----------|
109
+ | XFeat | 42.1 |
110
+ | XFeat* | - |
111
+ | Tiny RoMa v1 | 42.2 |
112
+ | RoMa | - |
113
+
114
+ ## BibTeX
115
+ If you find our models useful, please consider citing our paper!
116
+ ```
117
+ @article{edstedt2024roma,
118
+ title={{RoMa: Robust Dense Feature Matching}},
119
+ author={Edstedt, Johan and Sun, Qiyu and Bökman, Georg and Wadenbäck, Mårten and Felsberg, Michael},
120
+ journal={IEEE Conference on Computer Vision and Pattern Recognition},
121
+ year={2024}
122
+ }
123
+ ```
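As a complement to the Settings section of the RoMa README above, the sketch below shows one way the documented knobs could be set. The values are arbitrary examples; only the attribute names mentioned in the README (`w_resized`, `h_resized`, `upsample_res`, `sample_thresh`) and the constructor kwargs used in the bundled demos are assumed.

```python
# Sketch of the Settings knobs described in the RoMa README above (example values).
import torch
from romatch import roma_outdoor

device = "cuda" if torch.cuda.is_available() else "cpu"

# Resolutions can be set at construction, as in the bundled demos...
roma_model = roma_outdoor(device=device, coarse_res=560, upsample_res=(864, 864))

# ...or adjusted afterwards via the attributes named in the README.
roma_model.h_resized, roma_model.w_resized = 560, 560
roma_model.upsample_res = (864, 864)

# Threshold used when sampling matches for estimation (see "Sampling" above).
roma_model.sample_thresh = 0.05
```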
submodules/RoMa/data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *
2
+ !.gitignore
submodules/RoMa/demo/demo_3D_effect.py ADDED
@@ -0,0 +1,47 @@
1
+ from PIL import Image
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ from romatch.utils.utils import tensor_to_pil
6
+
7
+ from romatch import roma_outdoor
8
+
9
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
10
+ if torch.backends.mps.is_available():
11
+ device = torch.device('mps')
12
+
13
+ if __name__ == "__main__":
14
+ from argparse import ArgumentParser
15
+ parser = ArgumentParser()
16
+ parser.add_argument("--im_A_path", default="assets/toronto_A.jpg", type=str)
17
+ parser.add_argument("--im_B_path", default="assets/toronto_B.jpg", type=str)
18
+ parser.add_argument("--save_path", default="demo/gif/roma_warp_toronto", type=str)
19
+
20
+ args, _ = parser.parse_known_args()
21
+ im1_path = args.im_A_path
22
+ im2_path = args.im_B_path
23
+ save_path = args.save_path
24
+
25
+ # Create model
26
+ roma_model = roma_outdoor(device=device, coarse_res=560, upsample_res=(864, 1152))
27
+ roma_model.symmetric = False
28
+
29
+ H, W = roma_model.get_output_resolution()
30
+
31
+ im1 = Image.open(im1_path).resize((W, H))
32
+ im2 = Image.open(im2_path).resize((W, H))
33
+
34
+ # Match
35
+ warp, certainty = roma_model.match(im1_path, im2_path, device=device)
36
+ # Sampling not needed, but can be done with model.sample(warp, certainty)
37
+ x1 = (torch.tensor(np.array(im1)) / 255).to(device).permute(2, 0, 1)
38
+ x2 = (torch.tensor(np.array(im2)) / 255).to(device).permute(2, 0, 1)
39
+
40
+ coords_A, coords_B = warp[...,:2], warp[...,2:]
41
+ for i, x in enumerate(np.linspace(0,2*np.pi,200)):
42
+ t = (1 + np.cos(x))/2
43
+ interp_warp = (1-t)*coords_A + t*coords_B
44
+ im2_transfer_rgb = F.grid_sample(
45
+ x2[None], interp_warp[None], mode="bilinear", align_corners=False
46
+ )[0]
47
+ tensor_to_pil(im2_transfer_rgb, unnormalize=False).save(f"{save_path}_{i:03d}.jpg")
submodules/RoMa/demo/demo_fundamental.py ADDED
@@ -0,0 +1,34 @@
1
+ from PIL import Image
2
+ import torch
3
+ import cv2
4
+ from romatch import roma_outdoor
5
+
6
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
7
+ if torch.backends.mps.is_available():
8
+ device = torch.device('mps')
9
+
10
+ if __name__ == "__main__":
11
+ from argparse import ArgumentParser
12
+ parser = ArgumentParser()
13
+ parser.add_argument("--im_A_path", default="assets/sacre_coeur_A.jpg", type=str)
14
+ parser.add_argument("--im_B_path", default="assets/sacre_coeur_B.jpg", type=str)
15
+
16
+ args, _ = parser.parse_known_args()
17
+ im1_path = args.im_A_path
18
+ im2_path = args.im_B_path
19
+
20
+ # Create model
21
+ roma_model = roma_outdoor(device=device)
22
+
23
+
24
+ W_A, H_A = Image.open(im1_path).size
25
+ W_B, H_B = Image.open(im2_path).size
26
+
27
+ # Match
28
+ warp, certainty = roma_model.match(im1_path, im2_path, device=device)
29
+ # Sample matches for estimation
30
+ matches, certainty = roma_model.sample(warp, certainty)
31
+ kpts1, kpts2 = roma_model.to_pixel_coordinates(matches, H_A, W_A, H_B, W_B)
32
+ F, mask = cv2.findFundamentalMat(
33
+ kpts1.cpu().numpy(), kpts2.cpu().numpy(), ransacReprojThreshold=0.2, method=cv2.USAC_MAGSAC, confidence=0.999999, maxIters=10000
34
+ )
submodules/RoMa/demo/demo_match.py ADDED
@@ -0,0 +1,50 @@
1
+ import os
2
+ os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
3
+ import torch
4
+ from PIL import Image
5
+ import torch.nn.functional as F
6
+ import numpy as np
7
+ from romatch.utils.utils import tensor_to_pil
8
+
9
+ from romatch import roma_outdoor
10
+
11
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
12
+ if torch.backends.mps.is_available():
13
+ device = torch.device('mps')
14
+
15
+ if __name__ == "__main__":
16
+ from argparse import ArgumentParser
17
+ parser = ArgumentParser()
18
+ parser.add_argument("--im_A_path", default="assets/toronto_A.jpg", type=str)
19
+ parser.add_argument("--im_B_path", default="assets/toronto_B.jpg", type=str)
20
+ parser.add_argument("--save_path", default="demo/roma_warp_toronto.jpg", type=str)
21
+
22
+ args, _ = parser.parse_known_args()
23
+ im1_path = args.im_A_path
24
+ im2_path = args.im_B_path
25
+ save_path = args.save_path
26
+
27
+ # Create model
28
+ roma_model = roma_outdoor(device=device, coarse_res=560, upsample_res=(864, 1152))
29
+
30
+ H, W = roma_model.get_output_resolution()
31
+
32
+ im1 = Image.open(im1_path).resize((W, H))
33
+ im2 = Image.open(im2_path).resize((W, H))
34
+
35
+ # Match
36
+ warp, certainty = roma_model.match(im1_path, im2_path, device=device)
37
+ # Sampling not needed, but can be done with model.sample(warp, certainty)
38
+ x1 = (torch.tensor(np.array(im1)) / 255).to(device).permute(2, 0, 1)
39
+ x2 = (torch.tensor(np.array(im2)) / 255).to(device).permute(2, 0, 1)
40
+
41
+ im2_transfer_rgb = F.grid_sample(
42
+ x2[None], warp[:,:W, 2:][None], mode="bilinear", align_corners=False
43
+ )[0]
44
+ im1_transfer_rgb = F.grid_sample(
45
+ x1[None], warp[:, W:, :2][None], mode="bilinear", align_corners=False
46
+ )[0]
47
+ warp_im = torch.cat((im2_transfer_rgb,im1_transfer_rgb),dim=2)
48
+ white_im = torch.ones((H,2*W),device=device)
49
+ vis_im = certainty * warp_im + (1 - certainty) * white_im
50
+ tensor_to_pil(vis_im, unnormalize=False).save(save_path)
submodules/RoMa/demo/demo_match_opencv_sift.py ADDED
@@ -0,0 +1,43 @@
1
+ from PIL import Image
2
+ import numpy as np
3
+
4
+ import numpy as np
5
+ import cv2 as cv
6
+ import matplotlib.pyplot as plt
7
+
8
+
9
+
10
+ if __name__ == "__main__":
11
+ from argparse import ArgumentParser
12
+ parser = ArgumentParser()
13
+ parser.add_argument("--im_A_path", default="assets/toronto_A.jpg", type=str)
14
+ parser.add_argument("--im_B_path", default="assets/toronto_B.jpg", type=str)
15
+ parser.add_argument("--save_path", default="demo/roma_warp_toronto.jpg", type=str)
16
+
17
+ args, _ = parser.parse_known_args()
18
+ im1_path = args.im_A_path
19
+ im2_path = args.im_B_path
20
+ save_path = args.save_path
21
+
22
+ img1 = cv.imread(im1_path,cv.IMREAD_GRAYSCALE) # queryImage
23
+ img2 = cv.imread(im2_path,cv.IMREAD_GRAYSCALE) # trainImage
24
+ # Initiate SIFT detector
25
+ sift = cv.SIFT_create()
26
+ # find the keypoints and descriptors with SIFT
27
+ kp1, des1 = sift.detectAndCompute(img1,None)
28
+ kp2, des2 = sift.detectAndCompute(img2,None)
29
+ # BFMatcher with default params
30
+ bf = cv.BFMatcher()
31
+ matches = bf.knnMatch(des1,des2,k=2)
32
+ # Apply ratio test
33
+ good = []
34
+ for m,n in matches:
35
+ if m.distance < 0.75*n.distance:
36
+ good.append([m])
37
+ # cv.drawMatchesKnn expects list of lists as matches.
38
+ draw_params = dict(matchColor = (255,0,0), # draw matches in red color
39
+ singlePointColor = None,
40
+ flags = 2)
41
+
42
+ img3 = cv.drawMatchesKnn(img1,kp1,img2,kp2,good,None,**draw_params)
43
+ Image.fromarray(img3).save("demo/sift_matches.png")
submodules/RoMa/demo/demo_match_tiny.py ADDED
@@ -0,0 +1,77 @@
1
+ import os
2
+ os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
3
+ import torch
4
+ from PIL import Image
5
+ import torch.nn.functional as F
6
+ import numpy as np
7
+ from romatch.utils.utils import tensor_to_pil
8
+
9
+ from romatch import tiny_roma_v1_outdoor
10
+
11
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
12
+ if torch.backends.mps.is_available():
13
+ device = torch.device('mps')
14
+
15
+ if __name__ == "__main__":
16
+ from argparse import ArgumentParser
17
+ parser = ArgumentParser()
18
+ parser.add_argument("--im_A_path", default="assets/sacre_coeur_A.jpg", type=str)
19
+ parser.add_argument("--im_B_path", default="assets/sacre_coeur_B.jpg", type=str)
20
+ parser.add_argument("--save_A_path", default="demo/tiny_roma_warp_A.jpg", type=str)
21
+ parser.add_argument("--save_B_path", default="demo/tiny_roma_warp_B.jpg", type=str)
22
+
23
+ args, _ = parser.parse_known_args()
24
+ im1_path = args.im_A_path
25
+ im2_path = args.im_B_path
26
+
27
+ # Create model
28
+ roma_model = tiny_roma_v1_outdoor(device=device)
29
+
30
+ # Match
31
+ warp, certainty1 = roma_model.match(im1_path, im2_path)
32
+
33
+ h1, w1 = warp.shape[:2]
34
+
35
+ # maybe im1.size != im2.size
36
+ im1 = Image.open(im1_path).resize((w1, h1))
37
+ im2 = Image.open(im2_path)
38
+ x1 = (torch.tensor(np.array(im1)) / 255).to(device).permute(2, 0, 1)
39
+ x2 = (torch.tensor(np.array(im2)) / 255).to(device).permute(2, 0, 1)
40
+
41
+ h2, w2 = x2.shape[1:]
42
+ g1_p2x = w2 / 2 * (warp[..., 2] + 1)
43
+ g1_p2y = h2 / 2 * (warp[..., 3] + 1)
44
+ g2_p1x = torch.zeros((h2, w2), dtype=torch.float32).to(device) - 2
45
+ g2_p1y = torch.zeros((h2, w2), dtype=torch.float32).to(device) - 2
46
+
47
+ x, y = torch.meshgrid(
48
+ torch.arange(w1, device=device),
49
+ torch.arange(h1, device=device),
50
+ indexing="xy",
51
+ )
52
+ g2x = torch.round(g1_p2x[y, x]).long()
53
+ g2y = torch.round(g1_p2y[y, x]).long()
54
+ idx_x = torch.bitwise_and(0 <= g2x, g2x < w2)
55
+ idx_y = torch.bitwise_and(0 <= g2y, g2y < h2)
56
+ idx = torch.bitwise_and(idx_x, idx_y)
57
+ g2_p1x[g2y[idx], g2x[idx]] = x[idx].float() * 2 / w1 - 1
58
+ g2_p1y[g2y[idx], g2x[idx]] = y[idx].float() * 2 / h1 - 1
59
+
60
+ certainty2 = F.grid_sample(
61
+ certainty1[None][None],
62
+ torch.stack([g2_p1x, g2_p1y], dim=2)[None],
63
+ mode="bilinear",
64
+ align_corners=False,
65
+ )[0]
66
+
67
+ white_im1 = torch.ones((h1, w1), device = device)
68
+ white_im2 = torch.ones((h2, w2), device = device)
69
+
70
+ certainty1 = F.avg_pool2d(certainty1[None], kernel_size=5, stride=1, padding=2)[0]
71
+ certainty2 = F.avg_pool2d(certainty2[None], kernel_size=5, stride=1, padding=2)[0]
72
+
73
+ vis_im1 = certainty1 * x1 + (1 - certainty1) * white_im1
74
+ vis_im2 = certainty2 * x2 + (1 - certainty2) * white_im2
75
+
76
+ tensor_to_pil(vis_im1, unnormalize=False).save(args.save_A_path)
77
+ tensor_to_pil(vis_im2, unnormalize=False).save(args.save_B_path)
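Most of the index arithmetic in the tiny demo above converts between RoMa's normalized [-1, 1] warp coordinates and pixel coordinates. The helper pair below is a standalone sketch that mirrors the demo's own arithmetic (e.g. `w2 / 2 * (warp[..., 2] + 1)` and `x * 2 / w1 - 1`); it is not part of the repository.

```python
# Standalone sketch: the two coordinate conversions used inline in demo_match_tiny.py.
import torch

def norm_to_pixel(coords: torch.Tensor, h: int, w: int) -> torch.Tensor:
    """Map normalized [-1, 1] (x, y) coordinates to pixel coordinates."""
    x = w / 2 * (coords[..., 0] + 1)
    y = h / 2 * (coords[..., 1] + 1)
    return torch.stack([x, y], dim=-1)

def pixel_to_norm(coords: torch.Tensor, h: int, w: int) -> torch.Tensor:
    """Map pixel (x, y) coordinates back to the normalized [-1, 1] range."""
    x = coords[..., 0] * 2 / w - 1
    y = coords[..., 1] * 2 / h - 1
    return torch.stack([x, y], dim=-1)

# Round trip on a dummy grid of pixel coordinates.
h, w = 480, 640
ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
pix = torch.stack([xs, ys], dim=-1).float()
assert torch.allclose(norm_to_pixel(pixel_to_norm(pix, h, w), h, w), pix, atol=1e-4)
```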
submodules/RoMa/demo/gif/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *
2
+ !.gitignore
submodules/RoMa/experiments/eval_roma_outdoor.py ADDED
@@ -0,0 +1,57 @@
1
+ import json
2
+
3
+ from romatch.benchmarks import MegadepthDenseBenchmark
4
+ from romatch.benchmarks import MegaDepthPoseEstimationBenchmark, HpatchesHomogBenchmark
5
+ from romatch.benchmarks import Mega1500PoseLibBenchmark
6
+
7
+ def test_mega_8_scenes(model, name):
8
+ mega_8_scenes_benchmark = MegaDepthPoseEstimationBenchmark("data/megadepth",
9
+ scene_names=['mega_8_scenes_0019_0.1_0.3.npz',
10
+ 'mega_8_scenes_0025_0.1_0.3.npz',
11
+ 'mega_8_scenes_0021_0.1_0.3.npz',
12
+ 'mega_8_scenes_0008_0.1_0.3.npz',
13
+ 'mega_8_scenes_0032_0.1_0.3.npz',
14
+ 'mega_8_scenes_1589_0.1_0.3.npz',
15
+ 'mega_8_scenes_0063_0.1_0.3.npz',
16
+ 'mega_8_scenes_0024_0.1_0.3.npz',
17
+ 'mega_8_scenes_0019_0.3_0.5.npz',
18
+ 'mega_8_scenes_0025_0.3_0.5.npz',
19
+ 'mega_8_scenes_0021_0.3_0.5.npz',
20
+ 'mega_8_scenes_0008_0.3_0.5.npz',
21
+ 'mega_8_scenes_0032_0.3_0.5.npz',
22
+ 'mega_8_scenes_1589_0.3_0.5.npz',
23
+ 'mega_8_scenes_0063_0.3_0.5.npz',
24
+ 'mega_8_scenes_0024_0.3_0.5.npz'])
25
+ mega_8_scenes_results = mega_8_scenes_benchmark.benchmark(model, model_name=name)
26
+ print(mega_8_scenes_results)
27
+ json.dump(mega_8_scenes_results, open(f"results/mega_8_scenes_{name}.json", "w"))
28
+
29
+ def test_mega1500(model, name):
30
+ mega1500_benchmark = MegaDepthPoseEstimationBenchmark("data/megadepth")
31
+ mega1500_results = mega1500_benchmark.benchmark(model, model_name=name)
32
+ json.dump(mega1500_results, open(f"results/mega1500_{name}.json", "w"))
33
+
34
+ def test_mega1500_poselib(model, name):
35
+ mega1500_benchmark = Mega1500PoseLibBenchmark("data/megadepth")
36
+ mega1500_results = mega1500_benchmark.benchmark(model, model_name=name)
37
+ json.dump(mega1500_results, open(f"results/mega1500_{name}.json", "w"))
38
+
39
+ def test_mega_dense(model, name):
40
+ megadense_benchmark = MegadepthDenseBenchmark("data/megadepth", num_samples = 1000)
41
+ megadense_results = megadense_benchmark.benchmark(model)
42
+ json.dump(megadense_results, open(f"results/mega_dense_{name}.json", "w"))
43
+
44
+ def test_hpatches(model, name):
45
+ hpatches_benchmark = HpatchesHomogBenchmark("data/hpatches")
46
+ hpatches_results = hpatches_benchmark.benchmark(model)
47
+ json.dump(hpatches_results, open(f"results/hpatches_{name}.json", "w"))
48
+
49
+
50
+ if __name__ == "__main__":
51
+ from romatch import roma_outdoor
52
+ device = "cuda"
53
+ model = roma_outdoor(device = device, coarse_res = 672, upsample_res = 1344)
54
+ experiment_name = "roma_latest"
55
+ test_mega1500(model, experiment_name)
56
+ #test_mega1500_poselib(model, experiment_name)
57
+
submodules/RoMa/experiments/eval_tiny_roma_v1_outdoor.py ADDED
@@ -0,0 +1,84 @@
1
+ import torch
2
+ import os
3
+ from pathlib import Path
4
+ import json
5
+ from romatch.benchmarks import ScanNetBenchmark
6
+ from romatch.benchmarks import Mega1500PoseLibBenchmark, ScanNetPoselibBenchmark
7
+ from romatch.benchmarks import MegaDepthPoseEstimationBenchmark
8
+
9
+ def test_mega_8_scenes(model, name):
10
+ mega_8_scenes_benchmark = MegaDepthPoseEstimationBenchmark("data/megadepth",
11
+ scene_names=['mega_8_scenes_0019_0.1_0.3.npz',
12
+ 'mega_8_scenes_0025_0.1_0.3.npz',
13
+ 'mega_8_scenes_0021_0.1_0.3.npz',
14
+ 'mega_8_scenes_0008_0.1_0.3.npz',
15
+ 'mega_8_scenes_0032_0.1_0.3.npz',
16
+ 'mega_8_scenes_1589_0.1_0.3.npz',
17
+ 'mega_8_scenes_0063_0.1_0.3.npz',
18
+ 'mega_8_scenes_0024_0.1_0.3.npz',
19
+ 'mega_8_scenes_0019_0.3_0.5.npz',
20
+ 'mega_8_scenes_0025_0.3_0.5.npz',
21
+ 'mega_8_scenes_0021_0.3_0.5.npz',
22
+ 'mega_8_scenes_0008_0.3_0.5.npz',
23
+ 'mega_8_scenes_0032_0.3_0.5.npz',
24
+ 'mega_8_scenes_1589_0.3_0.5.npz',
25
+ 'mega_8_scenes_0063_0.3_0.5.npz',
26
+ 'mega_8_scenes_0024_0.3_0.5.npz'])
27
+ mega_8_scenes_results = mega_8_scenes_benchmark.benchmark(model, model_name=name)
28
+ print(mega_8_scenes_results)
29
+ json.dump(mega_8_scenes_results, open(f"results/mega_8_scenes_{name}.json", "w"))
30
+
31
+ def test_mega1500(model, name):
32
+ mega1500_benchmark = MegaDepthPoseEstimationBenchmark("data/megadepth")
33
+ mega1500_results = mega1500_benchmark.benchmark(model, model_name=name)
34
+ json.dump(mega1500_results, open(f"results/mega1500_{name}.json", "w"))
35
+
36
+ def test_mega1500_poselib(model, name):
37
+ #model.exact_softmax = True
38
+ mega1500_benchmark = Mega1500PoseLibBenchmark("data/megadepth", num_ransac_iter = 1, test_every = 1)
39
+ mega1500_results = mega1500_benchmark.benchmark(model, model_name=name)
40
+ json.dump(mega1500_results, open(f"results/mega1500_poselib_{name}.json", "w"))
41
+
42
+ def test_mega_8_scenes_poselib(model, name):
43
+ mega1500_benchmark = Mega1500PoseLibBenchmark("data/megadepth", num_ransac_iter = 1, test_every = 1,
44
+ scene_names=['mega_8_scenes_0019_0.1_0.3.npz',
45
+ 'mega_8_scenes_0025_0.1_0.3.npz',
46
+ 'mega_8_scenes_0021_0.1_0.3.npz',
47
+ 'mega_8_scenes_0008_0.1_0.3.npz',
48
+ 'mega_8_scenes_0032_0.1_0.3.npz',
49
+ 'mega_8_scenes_1589_0.1_0.3.npz',
50
+ 'mega_8_scenes_0063_0.1_0.3.npz',
51
+ 'mega_8_scenes_0024_0.1_0.3.npz',
52
+ 'mega_8_scenes_0019_0.3_0.5.npz',
53
+ 'mega_8_scenes_0025_0.3_0.5.npz',
54
+ 'mega_8_scenes_0021_0.3_0.5.npz',
55
+ 'mega_8_scenes_0008_0.3_0.5.npz',
56
+ 'mega_8_scenes_0032_0.3_0.5.npz',
57
+ 'mega_8_scenes_1589_0.3_0.5.npz',
58
+ 'mega_8_scenes_0063_0.3_0.5.npz',
59
+ 'mega_8_scenes_0024_0.3_0.5.npz'])
60
+ mega1500_results = mega1500_benchmark.benchmark(model, model_name=name)
61
+ json.dump(mega1500_results, open(f"results/mega_8_scenes_poselib_{name}.json", "w"))
62
+
63
+ def test_scannet_poselib(model, name):
64
+ scannet_benchmark = ScanNetPoselibBenchmark("data/scannet")
65
+ scannet_results = scannet_benchmark.benchmark(model)
66
+ json.dump(scannet_results, open(f"results/scannet_{name}.json", "w"))
67
+
68
+ def test_scannet(model, name):
69
+ scannet_benchmark = ScanNetBenchmark("data/scannet")
70
+ scannet_results = scannet_benchmark.benchmark(model)
71
+ json.dump(scannet_results, open(f"results/scannet_{name}.json", "w"))
72
+
73
+ if __name__ == "__main__":
74
+ os.environ["TORCH_CUDNN_V8_API_ENABLED"] = "1" # For BF16 computations
75
+ os.environ["OMP_NUM_THREADS"] = "16"
76
+ torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
77
+ from romatch import tiny_roma_v1_outdoor
78
+
79
+ experiment_name = Path(__file__).stem
80
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
81
+ model = tiny_roma_v1_outdoor(device)
82
+ #test_mega1500_poselib(model, experiment_name)
83
+ test_mega_8_scenes_poselib(model, experiment_name)
84
+
submodules/RoMa/experiments/roma_indoor.py ADDED
@@ -0,0 +1,320 @@
1
+ import os
2
+ import torch
3
+ from argparse import ArgumentParser
4
+
5
+ from torch import nn
6
+ from torch.utils.data import ConcatDataset
7
+ import torch.distributed as dist
8
+ from torch.nn.parallel import DistributedDataParallel as DDP
9
+
10
+ import json
11
+ import wandb
12
+ from tqdm import tqdm
13
+
14
+ from romatch.benchmarks import MegadepthDenseBenchmark
15
+ from romatch.datasets.megadepth import MegadepthBuilder
16
+ from romatch.datasets.scannet import ScanNetBuilder
17
+ from romatch.losses.robust_loss import RobustLosses
18
+ from romatch.benchmarks import MegadepthDenseBenchmark, ScanNetBenchmark
19
+ from romatch.train.train import train_k_steps
20
+ from romatch.models.matcher import *
21
+ from romatch.models.transformer import Block, TransformerDecoder, MemEffAttention
22
+ from romatch.models.encoders import *
23
+ from romatch.checkpointing import CheckPoint
24
+
25
+ resolutions = {"low":(448, 448), "medium":(14*8*5, 14*8*5), "high":(14*8*6, 14*8*6)}
26
+
27
+ def get_model(pretrained_backbone=True, resolution = "medium", **kwargs):
28
+ gp_dim = 512
29
+ feat_dim = 512
30
+ decoder_dim = gp_dim + feat_dim
31
+ cls_to_coord_res = 64
32
+ coordinate_decoder = TransformerDecoder(
33
+ nn.Sequential(*[Block(decoder_dim, 8, attn_class=MemEffAttention) for _ in range(5)]),
34
+ decoder_dim,
35
+ cls_to_coord_res**2 + 1,
36
+ is_classifier=True,
37
+ amp = True,
38
+ pos_enc = False,)
39
+ dw = True
40
+ hidden_blocks = 8
41
+ kernel_size = 5
42
+ displacement_emb = "linear"
43
+ disable_local_corr_grad = True
44
+
45
+ conv_refiner = nn.ModuleDict(
46
+ {
47
+ "16": ConvRefiner(
48
+ 2 * 512+128+(2*7+1)**2,
49
+ 2 * 512+128+(2*7+1)**2,
50
+ 2 + 1,
51
+ kernel_size=kernel_size,
52
+ dw=dw,
53
+ hidden_blocks=hidden_blocks,
54
+ displacement_emb=displacement_emb,
55
+ displacement_emb_dim=128,
56
+ local_corr_radius = 7,
57
+ corr_in_other = True,
58
+ amp = True,
59
+ disable_local_corr_grad = disable_local_corr_grad,
60
+ bn_momentum = 0.01,
61
+ ),
62
+ "8": ConvRefiner(
63
+ 2 * 512+64+(2*3+1)**2,
64
+ 2 * 512+64+(2*3+1)**2,
65
+ 2 + 1,
66
+ kernel_size=kernel_size,
67
+ dw=dw,
68
+ hidden_blocks=hidden_blocks,
69
+ displacement_emb=displacement_emb,
70
+ displacement_emb_dim=64,
71
+ local_corr_radius = 3,
72
+ corr_in_other = True,
73
+ amp = True,
74
+ disable_local_corr_grad = disable_local_corr_grad,
75
+ bn_momentum = 0.01,
76
+ ),
77
+ "4": ConvRefiner(
78
+ 2 * 256+32+(2*2+1)**2,
79
+ 2 * 256+32+(2*2+1)**2,
80
+ 2 + 1,
81
+ kernel_size=kernel_size,
82
+ dw=dw,
83
+ hidden_blocks=hidden_blocks,
84
+ displacement_emb=displacement_emb,
85
+ displacement_emb_dim=32,
86
+ local_corr_radius = 2,
87
+ corr_in_other = True,
88
+ amp = True,
89
+ disable_local_corr_grad = disable_local_corr_grad,
90
+ bn_momentum = 0.01,
91
+ ),
92
+ "2": ConvRefiner(
93
+ 2 * 64+16,
94
+ 128+16,
95
+ 2 + 1,
96
+ kernel_size=kernel_size,
97
+ dw=dw,
98
+ hidden_blocks=hidden_blocks,
99
+ displacement_emb=displacement_emb,
100
+ displacement_emb_dim=16,
101
+ amp = True,
102
+ disable_local_corr_grad = disable_local_corr_grad,
103
+ bn_momentum = 0.01,
104
+ ),
105
+ "1": ConvRefiner(
106
+ 2 * 9 + 6,
107
+ 24,
108
+ 2 + 1,
109
+ kernel_size=kernel_size,
110
+ dw=dw,
111
+ hidden_blocks = hidden_blocks,
112
+ displacement_emb = displacement_emb,
113
+ displacement_emb_dim = 6,
114
+ amp = True,
115
+ disable_local_corr_grad = disable_local_corr_grad,
116
+ bn_momentum = 0.01,
117
+ ),
118
+ }
119
+ )
120
+ kernel_temperature = 0.2
121
+ learn_temperature = False
122
+ no_cov = True
123
+ kernel = CosKernel
124
+ only_attention = False
125
+ basis = "fourier"
126
+ gp16 = GP(
127
+ kernel,
128
+ T=kernel_temperature,
129
+ learn_temperature=learn_temperature,
130
+ only_attention=only_attention,
131
+ gp_dim=gp_dim,
132
+ basis=basis,
133
+ no_cov=no_cov,
134
+ )
135
+ gps = nn.ModuleDict({"16": gp16})
136
+ proj16 = nn.Sequential(nn.Conv2d(1024, 512, 1, 1), nn.BatchNorm2d(512))
137
+ proj8 = nn.Sequential(nn.Conv2d(512, 512, 1, 1), nn.BatchNorm2d(512))
138
+ proj4 = nn.Sequential(nn.Conv2d(256, 256, 1, 1), nn.BatchNorm2d(256))
139
+ proj2 = nn.Sequential(nn.Conv2d(128, 64, 1, 1), nn.BatchNorm2d(64))
140
+ proj1 = nn.Sequential(nn.Conv2d(64, 9, 1, 1), nn.BatchNorm2d(9))
141
+ proj = nn.ModuleDict({
142
+ "16": proj16,
143
+ "8": proj8,
144
+ "4": proj4,
145
+ "2": proj2,
146
+ "1": proj1,
147
+ })
148
+ displacement_dropout_p = 0.0
149
+ gm_warp_dropout_p = 0.0
150
+ decoder = Decoder(coordinate_decoder,
151
+ gps,
152
+ proj,
153
+ conv_refiner,
154
+ detach=True,
155
+ scales=["16", "8", "4", "2", "1"],
156
+ displacement_dropout_p = displacement_dropout_p,
157
+ gm_warp_dropout_p = gm_warp_dropout_p)
158
+ h,w = resolutions[resolution]
159
+ encoder = CNNandDinov2(
160
+ cnn_kwargs = dict(
161
+ pretrained=pretrained_backbone,
162
+ amp = True),
163
+ amp = True,
164
+ use_vgg = True,
165
+ )
166
+ matcher = RegressionMatcher(encoder, decoder, h=h, w=w, alpha=1, beta=0,**kwargs)
167
+ return matcher
168
+
169
+ def train(args):
170
+ dist.init_process_group('nccl')
171
+ #torch._dynamo.config.verbose=True
172
+ gpus = int(os.environ['WORLD_SIZE'])
173
+ # create model and move it to GPU with id rank
174
+ rank = dist.get_rank()
175
+ print(f"Start running DDP on rank {rank}")
176
+ device_id = rank % torch.cuda.device_count()
177
+ romatch.LOCAL_RANK = device_id
178
+ torch.cuda.set_device(device_id)
179
+
180
+ resolution = args.train_resolution
181
+ wandb_log = not args.dont_log_wandb
182
+ experiment_name = os.path.splitext(os.path.basename(__file__))[0]
183
+ wandb_mode = "online" if wandb_log and rank == 0 and False else "disabled"
184
+ wandb.init(project="romatch", entity=args.wandb_entity, name=experiment_name, reinit=False, mode = wandb_mode)
185
+ checkpoint_dir = "workspace/checkpoints/"
186
+ h,w = resolutions[resolution]
187
+ model = get_model(pretrained_backbone=True, resolution=resolution, attenuate_cert = False).to(device_id)
188
+ # Num steps
189
+ global_step = 0
190
+ batch_size = args.gpu_batch_size
191
+ step_size = gpus*batch_size
192
+ romatch.STEP_SIZE = step_size
193
+
194
+ N = (32 * 250000) # 250k steps of batch size 32
195
+ # checkpoint every
196
+ k = 25000 // romatch.STEP_SIZE
197
+
198
+ # Data
199
+ mega = MegadepthBuilder(data_root="data/megadepth", loftr_ignore=True, imc21_ignore = True)
200
+ use_horizontal_flip_aug = True
201
+ rot_prob = 0
202
+ depth_interpolation_mode = "bilinear"
203
+ megadepth_train1 = mega.build_scenes(
204
+ split="train_loftr", min_overlap=0.01, shake_t=32, use_horizontal_flip_aug = use_horizontal_flip_aug, rot_prob = rot_prob,
205
+ ht=h,wt=w,
206
+ )
207
+ megadepth_train2 = mega.build_scenes(
208
+ split="train_loftr", min_overlap=0.35, shake_t=32, use_horizontal_flip_aug = use_horizontal_flip_aug, rot_prob = rot_prob,
209
+ ht=h,wt=w,
210
+ )
211
+ megadepth_train = ConcatDataset(megadepth_train1 + megadepth_train2)
212
+ mega_ws = mega.weight_scenes(megadepth_train, alpha=0.75)
213
+
214
+ scannet = ScanNetBuilder(data_root="data/scannet")
215
+ scannet_train = scannet.build_scenes(split="train", ht=h, wt=w, use_horizontal_flip_aug = use_horizontal_flip_aug)
216
+ scannet_train = ConcatDataset(scannet_train)
217
+ scannet_ws = scannet.weight_scenes(scannet_train, alpha=0.75)
218
+
219
+ # Loss and optimizer
220
+ depth_loss_scannet = RobustLosses(
221
+ ce_weight=0.0,
222
+ local_dist={1:4, 2:4, 4:8, 8:8},
223
+ local_largest_scale=8,
224
+ depth_interpolation_mode=depth_interpolation_mode,
225
+ alpha = 0.5,
226
+ c = 1e-4,)
227
+ # Loss and optimizer
228
+ depth_loss_mega = RobustLosses(
229
+ ce_weight=0.01,
230
+ local_dist={1:4, 2:4, 4:8, 8:8},
231
+ local_largest_scale=8,
232
+ depth_interpolation_mode=depth_interpolation_mode,
233
+ alpha = 0.5,
234
+ c = 1e-4,)
235
+ parameters = [
236
+ {"params": model.encoder.parameters(), "lr": romatch.STEP_SIZE * 5e-6 / 8},
237
+ {"params": model.decoder.parameters(), "lr": romatch.STEP_SIZE * 1e-4 / 8},
238
+ ]
239
+ optimizer = torch.optim.AdamW(parameters, weight_decay=0.01)
240
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
241
+ optimizer, milestones=[(9*N/romatch.STEP_SIZE)//10])
242
+ megadense_benchmark = MegadepthDenseBenchmark("data/megadepth", num_samples = 1000, h=h,w=w)
243
+ checkpointer = CheckPoint(checkpoint_dir, experiment_name)
244
+ model, optimizer, lr_scheduler, global_step = checkpointer.load(model, optimizer, lr_scheduler, global_step)
245
+ romatch.GLOBAL_STEP = global_step
246
+ ddp_model = DDP(model, device_ids=[device_id], find_unused_parameters = False, gradient_as_bucket_view=True)
247
+ grad_scaler = torch.cuda.amp.GradScaler(growth_interval=1_000_000)
248
+ grad_clip_norm = 0.01
249
+ for n in range(romatch.GLOBAL_STEP, N, k * romatch.STEP_SIZE):
250
+ mega_sampler = torch.utils.data.WeightedRandomSampler(
251
+ mega_ws, num_samples = batch_size * k, replacement=False
252
+ )
253
+ mega_dataloader = iter(
254
+ torch.utils.data.DataLoader(
255
+ megadepth_train,
256
+ batch_size = batch_size,
257
+ sampler = mega_sampler,
258
+ num_workers = 8,
259
+ )
260
+ )
261
+ scannet_ws_sampler = torch.utils.data.WeightedRandomSampler(
262
+ scannet_ws, num_samples=batch_size * k, replacement=False
263
+ )
264
+ scannet_dataloader = iter(
265
+ torch.utils.data.DataLoader(
266
+ scannet_train,
267
+ batch_size=batch_size,
268
+ sampler=scannet_ws_sampler,
269
+ num_workers=gpus * 8,
270
+ )
271
+ )
272
+ for n_k in tqdm(range(n, n + 2 * k, 2),disable = romatch.RANK > 0):
273
+ train_k_steps(
274
+ n_k, 1, mega_dataloader, ddp_model, depth_loss_mega, optimizer, lr_scheduler, grad_scaler, grad_clip_norm = grad_clip_norm, progress_bar=False
275
+ )
276
+ train_k_steps(
277
+ n_k + 1, 1, scannet_dataloader, ddp_model, depth_loss_scannet, optimizer, lr_scheduler, grad_scaler, grad_clip_norm = grad_clip_norm, progress_bar=False
278
+ )
279
+ checkpointer.save(model, optimizer, lr_scheduler, romatch.GLOBAL_STEP)
280
+ wandb.log(megadense_benchmark.benchmark(model), step = romatch.GLOBAL_STEP)
281
+
282
+ def test_scannet(model, name, resolution, sample_mode):
283
+ scannet_benchmark = ScanNetBenchmark("data/scannet")
284
+ scannet_results = scannet_benchmark.benchmark(model)
285
+ json.dump(scannet_results, open(f"results/scannet_{name}.json", "w"))
286
+
287
+ if __name__ == "__main__":
288
+ import warnings
289
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
290
+ warnings.filterwarnings('ignore')#, category=UserWarning)#, message='WARNING batched routines are designed for small sizes.')
291
+ os.environ["TORCH_CUDNN_V8_API_ENABLED"] = "1" # For BF16 computations
292
+ os.environ["OMP_NUM_THREADS"] = "16"
293
+
294
+ import romatch
295
+ parser = ArgumentParser()
296
+ parser.add_argument("--test", action='store_true')
297
+ parser.add_argument("--debug_mode", action='store_true')
298
+ parser.add_argument("--dont_log_wandb", action='store_true')
299
+ parser.add_argument("--train_resolution", default='medium')
300
+ parser.add_argument("--gpu_batch_size", default=4, type=int)
301
+ parser.add_argument("--wandb_entity", required = False)
302
+
303
+ args, _ = parser.parse_known_args()
304
+ romatch.DEBUG_MODE = args.debug_mode
305
+ if not args.test:
306
+ train(args)
307
+ experiment_name = os.path.splitext(os.path.basename(__file__))[0]
308
+ checkpoint_dir = "workspace/"
309
+ checkpoint_name = checkpoint_dir + experiment_name + ".pth"
310
+ test_resolution = "medium"
311
+ sample_mode = "threshold_balanced"
312
+ symmetric = True
313
+ upsample_preds = False
314
+ attenuate_cert = True
315
+
316
+ model = get_model(pretrained_backbone=False, resolution = test_resolution, sample_mode = sample_mode, upsample_preds = upsample_preds, symmetric=symmetric, name=experiment_name, attenuate_cert = attenuate_cert)
317
+ model = model.cuda()
318
+ states = torch.load(checkpoint_name)
319
+ model.load_state_dict(states["model"])
320
+ test_scannet(model, experiment_name, resolution = test_resolution, sample_mode = sample_mode)
submodules/RoMa/experiments/train_roma_outdoor.py ADDED
@@ -0,0 +1,307 @@
1
+ import os
2
+ import torch
3
+ from argparse import ArgumentParser
4
+
5
+ from torch import nn
6
+ from torch.utils.data import ConcatDataset
7
+ import torch.distributed as dist
8
+ from torch.nn.parallel import DistributedDataParallel as DDP
9
+ import json
10
+ import wandb
11
+
12
+ from romatch.benchmarks import MegadepthDenseBenchmark
13
+ from romatch.datasets.megadepth import MegadepthBuilder
14
+ from romatch.losses.robust_loss import RobustLosses
15
+ from romatch.benchmarks import MegaDepthPoseEstimationBenchmark, MegadepthDenseBenchmark, HpatchesHomogBenchmark
16
+
17
+ from romatch.train.train import train_k_steps
18
+ from romatch.models.matcher import *
19
+ from romatch.models.transformer import Block, TransformerDecoder, MemEffAttention
20
+ from romatch.models.encoders import *
21
+ from romatch.checkpointing import CheckPoint
22
+
23
+ resolutions = {"low":(448, 448), "medium":(14*8*5, 14*8*5), "high":(14*8*6, 14*8*6)}
24
+
25
+ def get_model(pretrained_backbone=True, resolution = "medium", **kwargs):
26
+ import warnings
27
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
28
+ gp_dim = 512
29
+ feat_dim = 512
30
+ decoder_dim = gp_dim + feat_dim
31
+ cls_to_coord_res = 64
32
+ coordinate_decoder = TransformerDecoder(
33
+ nn.Sequential(*[Block(decoder_dim, 8, attn_class=MemEffAttention) for _ in range(5)]),
34
+ decoder_dim,
35
+ cls_to_coord_res**2 + 1,
36
+ is_classifier=True,
37
+ amp = True,
38
+ pos_enc = False,)
39
+ dw = True
40
+ hidden_blocks = 8
41
+ kernel_size = 5
42
+ displacement_emb = "linear"
43
+ disable_local_corr_grad = True
44
+
45
+ conv_refiner = nn.ModuleDict(
46
+ {
47
+ "16": ConvRefiner(
48
+ 2 * 512+128+(2*7+1)**2,
49
+ 2 * 512+128+(2*7+1)**2,
50
+ 2 + 1,
51
+ kernel_size=kernel_size,
52
+ dw=dw,
53
+ hidden_blocks=hidden_blocks,
54
+ displacement_emb=displacement_emb,
55
+ displacement_emb_dim=128,
56
+ local_corr_radius = 7,
57
+ corr_in_other = True,
58
+ amp = True,
59
+ disable_local_corr_grad = disable_local_corr_grad,
60
+ bn_momentum = 0.01,
61
+ ),
62
+ "8": ConvRefiner(
63
+ 2 * 512+64+(2*3+1)**2,
64
+ 2 * 512+64+(2*3+1)**2,
65
+ 2 + 1,
66
+ kernel_size=kernel_size,
67
+ dw=dw,
68
+ hidden_blocks=hidden_blocks,
69
+ displacement_emb=displacement_emb,
70
+ displacement_emb_dim=64,
71
+ local_corr_radius = 3,
72
+ corr_in_other = True,
73
+ amp = True,
74
+ disable_local_corr_grad = disable_local_corr_grad,
75
+ bn_momentum = 0.01,
76
+ ),
77
+ "4": ConvRefiner(
78
+ 2 * 256+32+(2*2+1)**2,
79
+ 2 * 256+32+(2*2+1)**2,
80
+ 2 + 1,
81
+ kernel_size=kernel_size,
82
+ dw=dw,
83
+ hidden_blocks=hidden_blocks,
84
+ displacement_emb=displacement_emb,
85
+ displacement_emb_dim=32,
86
+ local_corr_radius = 2,
87
+ corr_in_other = True,
88
+ amp = True,
89
+ disable_local_corr_grad = disable_local_corr_grad,
90
+ bn_momentum = 0.01,
91
+ ),
92
+ "2": ConvRefiner(
93
+ 2 * 64+16,
94
+ 128+16,
95
+ 2 + 1,
96
+ kernel_size=kernel_size,
97
+ dw=dw,
98
+ hidden_blocks=hidden_blocks,
99
+ displacement_emb=displacement_emb,
100
+ displacement_emb_dim=16,
101
+ amp = True,
102
+ disable_local_corr_grad = disable_local_corr_grad,
103
+ bn_momentum = 0.01,
104
+ ),
105
+ "1": ConvRefiner(
106
+ 2 * 9 + 6,
107
+ 24,
108
+ 2 + 1,
109
+ kernel_size=kernel_size,
110
+ dw=dw,
111
+ hidden_blocks = hidden_blocks,
112
+ displacement_emb = displacement_emb,
113
+ displacement_emb_dim = 6,
114
+ amp = True,
115
+ disable_local_corr_grad = disable_local_corr_grad,
116
+ bn_momentum = 0.01,
117
+ ),
118
+ }
119
+ )
120
+ kernel_temperature = 0.2
121
+ learn_temperature = False
122
+ no_cov = True
123
+ kernel = CosKernel
124
+ only_attention = False
125
+ basis = "fourier"
126
+ gp16 = GP(
127
+ kernel,
128
+ T=kernel_temperature,
129
+ learn_temperature=learn_temperature,
130
+ only_attention=only_attention,
131
+ gp_dim=gp_dim,
132
+ basis=basis,
133
+ no_cov=no_cov,
134
+ )
135
+ gps = nn.ModuleDict({"16": gp16})
136
+ proj16 = nn.Sequential(nn.Conv2d(1024, 512, 1, 1), nn.BatchNorm2d(512))
137
+ proj8 = nn.Sequential(nn.Conv2d(512, 512, 1, 1), nn.BatchNorm2d(512))
138
+ proj4 = nn.Sequential(nn.Conv2d(256, 256, 1, 1), nn.BatchNorm2d(256))
139
+ proj2 = nn.Sequential(nn.Conv2d(128, 64, 1, 1), nn.BatchNorm2d(64))
140
+ proj1 = nn.Sequential(nn.Conv2d(64, 9, 1, 1), nn.BatchNorm2d(9))
141
+ proj = nn.ModuleDict({
142
+ "16": proj16,
143
+ "8": proj8,
144
+ "4": proj4,
145
+ "2": proj2,
146
+ "1": proj1,
147
+ })
148
+ displacement_dropout_p = 0.0
149
+ gm_warp_dropout_p = 0.0
150
+ decoder = Decoder(coordinate_decoder,
151
+ gps,
152
+ proj,
153
+ conv_refiner,
154
+ detach=True,
155
+ scales=["16", "8", "4", "2", "1"],
156
+ displacement_dropout_p = displacement_dropout_p,
157
+ gm_warp_dropout_p = gm_warp_dropout_p)
158
+ h,w = resolutions[resolution]
159
+ encoder = CNNandDinov2(
160
+ cnn_kwargs = dict(
161
+ pretrained=pretrained_backbone,
162
+ amp = True),
163
+ amp = True,
164
+ use_vgg = True,
165
+ )
166
+ matcher = RegressionMatcher(encoder, decoder, h=h, w=w,**kwargs)
167
+ return matcher
168
+
169
+ def train(args):
170
+ dist.init_process_group('nccl')
171
+ #torch._dynamo.config.verbose=True
172
+ gpus = int(os.environ['WORLD_SIZE'])
173
+ # create model and move it to GPU with id rank
174
+ rank = dist.get_rank()
175
+ print(f"Start running DDP on rank {rank}")
176
+ device_id = rank % torch.cuda.device_count()
177
+ romatch.LOCAL_RANK = device_id
178
+ torch.cuda.set_device(device_id)
179
+
180
+ resolution = args.train_resolution
181
+ wandb_log = not args.dont_log_wandb
182
+ experiment_name = os.path.splitext(os.path.basename(__file__))[0]
183
+ wandb_mode = "online" if wandb_log and rank == 0 else "disabled"
184
+ wandb.init(project="romatch", entity=args.wandb_entity, name=experiment_name, reinit=False, mode = wandb_mode)
185
+ checkpoint_dir = "workspace/checkpoints/"
186
+ h,w = resolutions[resolution]
187
+ model = get_model(pretrained_backbone=True, resolution=resolution, attenuate_cert = False).to(device_id)
188
+ # Num steps
189
+ global_step = 0
190
+ batch_size = args.gpu_batch_size
191
+ step_size = gpus*batch_size
192
+ romatch.STEP_SIZE = step_size
193
+
194
+ N = (32 * 250000) # 250k steps of batch size 32
195
+ # checkpoint every
196
+ k = 25000 // romatch.STEP_SIZE
197
+
198
+ # Data
199
+ mega = MegadepthBuilder(data_root="data/megadepth", loftr_ignore=True, imc21_ignore = True)
200
+ use_horizontal_flip_aug = True
201
+ rot_prob = 0
202
+ depth_interpolation_mode = "bilinear"
203
+ megadepth_train1 = mega.build_scenes(
204
+ split="train_loftr", min_overlap=0.01, shake_t=32, use_horizontal_flip_aug = use_horizontal_flip_aug, rot_prob = rot_prob,
205
+ ht=h,wt=w,
206
+ )
207
+ megadepth_train2 = mega.build_scenes(
208
+ split="train_loftr", min_overlap=0.35, shake_t=32, use_horizontal_flip_aug = use_horizontal_flip_aug, rot_prob = rot_prob,
209
+ ht=h,wt=w,
210
+ )
211
+ megadepth_train = ConcatDataset(megadepth_train1 + megadepth_train2)
212
+ mega_ws = mega.weight_scenes(megadepth_train, alpha=0.75)
213
+ # Loss and optimizer
214
+ depth_loss = RobustLosses(
215
+ ce_weight=0.01,
216
+ local_dist={1:4, 2:4, 4:8, 8:8},
217
+ local_largest_scale=8,
218
+ depth_interpolation_mode=depth_interpolation_mode,
219
+ alpha = 0.5,
220
+ c = 1e-4,)
221
+ parameters = [
222
+ {"params": model.encoder.parameters(), "lr": romatch.STEP_SIZE * 5e-6 / 8},
223
+ {"params": model.decoder.parameters(), "lr": romatch.STEP_SIZE * 1e-4 / 8},
224
+ ]
225
+ optimizer = torch.optim.AdamW(parameters, weight_decay=0.01)
226
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
227
+ optimizer, milestones=[(9*N/romatch.STEP_SIZE)//10])
228
+ megadense_benchmark = MegadepthDenseBenchmark("data/megadepth", num_samples = 1000, h=h,w=w)
229
+ checkpointer = CheckPoint(checkpoint_dir, experiment_name)
230
+ model, optimizer, lr_scheduler, global_step = checkpointer.load(model, optimizer, lr_scheduler, global_step)
231
+ romatch.GLOBAL_STEP = global_step
232
+ ddp_model = DDP(model, device_ids=[device_id], find_unused_parameters = False, gradient_as_bucket_view=True)
233
+ grad_scaler = torch.cuda.amp.GradScaler(growth_interval=1_000_000)
234
+ grad_clip_norm = 0.01
235
+ for n in range(romatch.GLOBAL_STEP, N, k * romatch.STEP_SIZE):
236
+ mega_sampler = torch.utils.data.WeightedRandomSampler(
237
+ mega_ws, num_samples = batch_size * k, replacement=False
238
+ )
239
+ mega_dataloader = iter(
240
+ torch.utils.data.DataLoader(
241
+ megadepth_train,
242
+ batch_size = batch_size,
243
+ sampler = mega_sampler,
244
+ num_workers = 8,
245
+ )
246
+ )
247
+ train_k_steps(
248
+ n, k, mega_dataloader, ddp_model, depth_loss, optimizer, lr_scheduler, grad_scaler, grad_clip_norm = grad_clip_norm,
249
+ )
250
+ checkpointer.save(model, optimizer, lr_scheduler, romatch.GLOBAL_STEP)
251
+ wandb.log(megadense_benchmark.benchmark(model), step = romatch.GLOBAL_STEP)
252
+
253
+ def test_mega_8_scenes(model, name):
254
+ mega_8_scenes_benchmark = MegaDepthPoseEstimationBenchmark("data/megadepth",
255
+ scene_names=['mega_8_scenes_0019_0.1_0.3.npz',
256
+ 'mega_8_scenes_0025_0.1_0.3.npz',
257
+ 'mega_8_scenes_0021_0.1_0.3.npz',
258
+ 'mega_8_scenes_0008_0.1_0.3.npz',
259
+ 'mega_8_scenes_0032_0.1_0.3.npz',
260
+ 'mega_8_scenes_1589_0.1_0.3.npz',
261
+ 'mega_8_scenes_0063_0.1_0.3.npz',
262
+ 'mega_8_scenes_0024_0.1_0.3.npz',
263
+ 'mega_8_scenes_0019_0.3_0.5.npz',
264
+ 'mega_8_scenes_0025_0.3_0.5.npz',
265
+ 'mega_8_scenes_0021_0.3_0.5.npz',
266
+ 'mega_8_scenes_0008_0.3_0.5.npz',
267
+ 'mega_8_scenes_0032_0.3_0.5.npz',
268
+ 'mega_8_scenes_1589_0.3_0.5.npz',
269
+ 'mega_8_scenes_0063_0.3_0.5.npz',
270
+ 'mega_8_scenes_0024_0.3_0.5.npz'])
271
+ mega_8_scenes_results = mega_8_scenes_benchmark.benchmark(model, model_name=name)
272
+ print(mega_8_scenes_results)
273
+ json.dump(mega_8_scenes_results, open(f"results/mega_8_scenes_{name}.json", "w"))
274
+
275
+ def test_mega1500(model, name):
276
+ mega1500_benchmark = MegaDepthPoseEstimationBenchmark("data/megadepth")
277
+ mega1500_results = mega1500_benchmark.benchmark(model, model_name=name)
278
+ json.dump(mega1500_results, open(f"results/mega1500_{name}.json", "w"))
279
+
280
+ def test_mega_dense(model, name):
281
+ megadense_benchmark = MegadepthDenseBenchmark("data/megadepth", num_samples = 1000)
282
+ megadense_results = megadense_benchmark.benchmark(model)
283
+ json.dump(megadense_results, open(f"results/mega_dense_{name}.json", "w"))
284
+
285
+ def test_hpatches(model, name):
286
+ hpatches_benchmark = HpatchesHomogBenchmark("data/hpatches")
287
+ hpatches_results = hpatches_benchmark.benchmark(model)
288
+ json.dump(hpatches_results, open(f"results/hpatches_{name}.json", "w"))
289
+
290
+
291
+ if __name__ == "__main__":
292
+ os.environ["TORCH_CUDNN_V8_API_ENABLED"] = "1" # For BF16 computations
293
+ os.environ["OMP_NUM_THREADS"] = "16"
294
+ torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
295
+ import romatch
296
+ parser = ArgumentParser()
297
+ parser.add_argument("--only_test", action='store_true')
298
+ parser.add_argument("--debug_mode", action='store_true')
299
+ parser.add_argument("--dont_log_wandb", action='store_true')
300
+ parser.add_argument("--train_resolution", default='medium')
301
+ parser.add_argument("--gpu_batch_size", default=8, type=int)
302
+ parser.add_argument("--wandb_entity", required = False)
303
+
304
+ args, _ = parser.parse_known_args()
305
+ romatch.DEBUG_MODE = args.debug_mode
306
+ if not args.only_test:
307
+ train(args)
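
For orientation, a minimal sketch of the step bookkeeping in the train() loop above. The GPU count and per-GPU batch size here are assumptions; the script derives them at runtime from WORLD_SIZE and --gpu_batch_size.

    # Hypothetical launch: 4 GPUs, per-GPU batch size 8.
    gpus, batch_size = 4, 8
    STEP_SIZE = gpus * batch_size              # 32 image pairs per optimizer step
    N = 32 * 250_000                           # 8,000,000 pairs total, i.e. 250k steps at batch 32
    k = 25_000 // STEP_SIZE                    # 781 optimizer steps between checkpoints/benchmarks
    pairs_per_cycle = k * STEP_SIZE            # 24,992 pairs consumed per outer-loop iteration
    outer_cycles = N // pairs_per_cycle        # ~320 checkpoint/benchmark cycles
    lr_milestone = (9 * N / STEP_SIZE) // 10   # MultiStepLR drop at step 225,000 (90% of training)
    print(STEP_SIZE, k, pairs_per_cycle, outer_cycles, lr_milestone)
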
submodules/RoMa/experiments/train_tiny_roma_v1_outdoor.py ADDED
@@ -0,0 +1,498 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import os
5
+ import torch
6
+ from argparse import ArgumentParser
7
+ from pathlib import Path
8
+ import math
9
+ import numpy as np
10
+
11
+ from torch import nn
12
+ from torch.utils.data import ConcatDataset
13
+ import torch.distributed as dist
14
+ from torch.nn.parallel import DistributedDataParallel as DDP
15
+ import json
16
+ import wandb
17
+ from PIL import Image
18
+ from torchvision.transforms import ToTensor
19
+
20
+ from romatch.benchmarks import MegadepthDenseBenchmark, ScanNetBenchmark
21
+ from romatch.benchmarks import Mega1500PoseLibBenchmark, ScanNetPoselibBenchmark
22
+ from romatch.datasets.megadepth import MegadepthBuilder
23
+ from romatch.losses.robust_loss_tiny_roma import RobustLosses
24
+ from romatch.benchmarks import MegaDepthPoseEstimationBenchmark, MegadepthDenseBenchmark, HpatchesHomogBenchmark
25
+ from romatch.train.train import train_k_steps
26
+ from romatch.checkpointing import CheckPoint
27
+
28
+ resolutions = {"low":(448, 448), "medium":(14*8*5, 14*8*5), "high":(14*8*6, 14*8*6), "xfeat": (600,800), "big": (768, 1024)}
29
+
30
+ def kde(x, std = 0.1):
31
+ # use a gaussian kernel to estimate density
32
+ x = x.half() # Do it in half precision TODO: remove hardcoding
33
+ scores = (-torch.cdist(x,x)**2/(2*std**2)).exp()
34
+ density = scores.sum(dim=-1)
35
+ return density
36
+
37
+ class BasicLayer(nn.Module):
38
+ """
39
+ Basic Convolutional Layer: Conv2d -> BatchNorm -> ReLU
40
+ """
41
+ def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, dilation=1, bias=False, relu = True):
42
+ super().__init__()
43
+ self.layer = nn.Sequential(
44
+ nn.Conv2d( in_channels, out_channels, kernel_size, padding = padding, stride=stride, dilation=dilation, bias = bias),
45
+ nn.BatchNorm2d(out_channels, affine=False),
46
+ nn.ReLU(inplace = True) if relu else nn.Identity()
47
+ )
48
+
49
+ def forward(self, x):
50
+ return self.layer(x)
51
+
52
+ class XFeatModel(nn.Module):
53
+ """
54
+ Implementation of architecture described in
55
+ "XFeat: Accelerated Features for Lightweight Image Matching, CVPR 2024."
56
+ """
57
+
58
+ def __init__(self, xfeat = None,
59
+ freeze_xfeat = True,
60
+ sample_mode = "threshold_balanced",
61
+ symmetric = False,
62
+ exact_softmax = False):
63
+ super().__init__()
64
+ if xfeat is None:
65
+ xfeat = torch.hub.load('verlab/accelerated_features', 'XFeat', pretrained = True, top_k = 4096).net
66
+ del xfeat.heatmap_head, xfeat.keypoint_head, xfeat.fine_matcher
67
+ if freeze_xfeat:
68
+ xfeat.train(False)
69
+ self.xfeat = [xfeat] # wrapped in a plain list so the frozen params are hidden from DDP
70
+ else:
71
+ self.xfeat = nn.ModuleList([xfeat])
72
+ self.freeze_xfeat = freeze_xfeat
73
+ match_dim = 256
74
+ self.coarse_matcher = nn.Sequential(
75
+ BasicLayer(64+64+2, match_dim,),
76
+ BasicLayer(match_dim, match_dim,),
77
+ BasicLayer(match_dim, match_dim,),
78
+ BasicLayer(match_dim, match_dim,),
79
+ nn.Conv2d(match_dim, 3, kernel_size=1, bias=True, padding=0))
80
+ fine_match_dim = 64
81
+ self.fine_matcher = nn.Sequential(
82
+ BasicLayer(24+24+2, fine_match_dim,),
83
+ BasicLayer(fine_match_dim, fine_match_dim,),
84
+ BasicLayer(fine_match_dim, fine_match_dim,),
85
+ BasicLayer(fine_match_dim, fine_match_dim,),
86
+ nn.Conv2d(fine_match_dim, 3, kernel_size=1, bias=True, padding=0),)
87
+ self.sample_mode = sample_mode
88
+ self.sample_thresh = 0.2
89
+ self.symmetric = symmetric
90
+ self.exact_softmax = exact_softmax
91
+
92
+ @property
93
+ def device(self):
94
+ return self.fine_matcher[-1].weight.device
95
+
96
+ def preprocess_tensor(self, x):
97
+ """ Guarantee that image is divisible by 32 to avoid aliasing artifacts. """
98
+ H, W = x.shape[-2:]
99
+ _H, _W = (H//32) * 32, (W//32) * 32
100
+ rh, rw = H/_H, W/_W
101
+
102
+ x = F.interpolate(x, (_H, _W), mode='bilinear', align_corners=False)
103
+ return x, rh, rw
104
+
105
+ def forward_single(self, x):
106
+ with torch.inference_mode(self.freeze_xfeat or not self.training):
107
+ xfeat = self.xfeat[0]
108
+ with torch.no_grad():
109
+ x = x.mean(dim=1, keepdim = True)
110
+ x = xfeat.norm(x)
111
+
112
+ #main backbone
113
+ x1 = xfeat.block1(x)
114
+ x2 = xfeat.block2(x1 + xfeat.skip1(x))
115
+ x3 = xfeat.block3(x2)
116
+ x4 = xfeat.block4(x3)
117
+ x5 = xfeat.block5(x4)
118
+ x4 = F.interpolate(x4, (x3.shape[-2], x3.shape[-1]), mode='bilinear')
119
+ x5 = F.interpolate(x5, (x3.shape[-2], x3.shape[-1]), mode='bilinear')
120
+ feats = xfeat.block_fusion( x3 + x4 + x5 )
121
+ if self.freeze_xfeat:
122
+ return x2.clone(), feats.clone()
123
+ return x2, feats
124
+
125
+ def to_pixel_coordinates(self, coords, H_A, W_A, H_B = None, W_B = None):
126
+ if coords.shape[-1] == 2:
127
+ return self._to_pixel_coordinates(coords, H_A, W_A)
128
+
129
+ if isinstance(coords, (list, tuple)):
130
+ kpts_A, kpts_B = coords[0], coords[1]
131
+ else:
132
+ kpts_A, kpts_B = coords[...,:2], coords[...,2:]
133
+ return self._to_pixel_coordinates(kpts_A, H_A, W_A), self._to_pixel_coordinates(kpts_B, H_B, W_B)
134
+
135
+ def _to_pixel_coordinates(self, coords, H, W):
136
+ kpts = torch.stack((W/2 * (coords[...,0]+1), H/2 * (coords[...,1]+1)),axis=-1)
137
+ return kpts
138
+
139
+ def pos_embed(self, corr_volume: torch.Tensor):
140
+ B, H1, W1, H0, W0 = corr_volume.shape
141
+ grid = torch.stack(
142
+ torch.meshgrid(
143
+ torch.linspace(-1+1/W1,1-1/W1, W1),
144
+ torch.linspace(-1+1/H1,1-1/H1, H1),
145
+ indexing = "xy"),
146
+ dim = -1).float().to(corr_volume).reshape(H1*W1, 2)
147
+ down = 4
148
+ if not self.training and not self.exact_softmax:
149
+ grid_lr = torch.stack(
150
+ torch.meshgrid(
151
+ torch.linspace(-1+down/W1,1-down/W1, W1//down),
152
+ torch.linspace(-1+down/H1,1-down/H1, H1//down),
153
+ indexing = "xy"),
154
+ dim = -1).float().to(corr_volume).reshape(H1*W1 //down**2, 2)
155
+ cv = corr_volume
156
+ best_match = cv.reshape(B,H1*W1,H0,W0).amax(dim=1) # B, HW, H, W
157
+ P_lowres = torch.cat((cv[:,::down,::down].reshape(B,H1*W1 // down**2,H0,W0), best_match[:,None]),dim=1).softmax(dim=1)
158
+ pos_embeddings = torch.einsum('bchw,cd->bdhw', P_lowres[:,:-1], grid_lr)
159
+ pos_embeddings += P_lowres[:,-1] * grid[best_match].permute(0,3,1,2)
160
+ else:
161
+ P = corr_volume.reshape(B,H1*W1,H0,W0).softmax(dim=1) # B, HW, H, W
162
+ pos_embeddings = torch.einsum('bchw,cd->bdhw', P, grid)
163
+ return pos_embeddings
164
+
165
+ def visualize_warp(self, warp, certainty, im_A = None, im_B = None,
166
+ im_A_path = None, im_B_path = None, symmetric = True, save_path = None, unnormalize = False):
167
+ device = warp.device
168
+ H,W2,_ = warp.shape
169
+ W = W2//2 if symmetric else W2
170
+ if im_A is None:
171
+ from PIL import Image
172
+ im_A, im_B = Image.open(im_A_path).convert("RGB"), Image.open(im_B_path).convert("RGB")
173
+ if not isinstance(im_A, torch.Tensor):
174
+ im_A = im_A.resize((W,H))
175
+ im_B = im_B.resize((W,H))
176
+ x_B = (torch.tensor(np.array(im_B)) / 255).to(device).permute(2, 0, 1)
177
+ if symmetric:
178
+ x_A = (torch.tensor(np.array(im_A)) / 255).to(device).permute(2, 0, 1)
179
+ else:
180
+ if symmetric:
181
+ x_A = im_A
182
+ x_B = im_B
183
+ im_A_transfer_rgb = F.grid_sample(
184
+ x_B[None], warp[:,:W, 2:][None], mode="bilinear", align_corners=False
185
+ )[0]
186
+ if symmetric:
187
+ im_B_transfer_rgb = F.grid_sample(
188
+ x_A[None], warp[:, W:, :2][None], mode="bilinear", align_corners=False
189
+ )[0]
190
+ warp_im = torch.cat((im_A_transfer_rgb,im_B_transfer_rgb),dim=2)
191
+ white_im = torch.ones((H,2*W),device=device)
192
+ else:
193
+ warp_im = im_A_transfer_rgb
194
+ white_im = torch.ones((H, W), device = device)
195
+ vis_im = certainty * warp_im + (1 - certainty) * white_im
196
+ if save_path is not None:
197
+ from romatch.utils import tensor_to_pil
198
+ tensor_to_pil(vis_im, unnormalize=unnormalize).save(save_path)
199
+ return vis_im
200
+
201
+ def corr_volume(self, feat0, feat1):
202
+ """
203
+ input:
204
+ feat0 -> torch.Tensor(B, C, H, W)
205
+ feat1 -> torch.Tensor(B, C, H, W)
206
+ return:
207
+ corr_volume -> torch.Tensor(B, H, W, H, W)
208
+ """
209
+ B, C, H0, W0 = feat0.shape
210
+ B, C, H1, W1 = feat1.shape
211
+ feat0 = feat0.view(B, C, H0*W0)
212
+ feat1 = feat1.view(B, C, H1*W1)
213
+ corr_volume = torch.einsum('bci,bcj->bji', feat0, feat1).reshape(B, H1, W1, H0 , W0)/math.sqrt(C) #16*16*16
214
+ return corr_volume
215
+
216
+ @torch.inference_mode()
217
+ def match_from_path(self, im0_path, im1_path):
218
+ device = self.device
219
+ im0 = ToTensor()(Image.open(im0_path))[None].to(device)
220
+ im1 = ToTensor()(Image.open(im1_path))[None].to(device)
221
+ return self.match(im0, im1, batched = False)
222
+
223
+ @torch.inference_mode()
224
+ def match(self, im0, im1, *args, batched = True):
225
+ # stupid
226
+ if isinstance(im0, (str, Path)):
227
+ return self.match_from_path(im0, im1)
228
+ elif isinstance(im0, Image.Image):
229
+ batched = False
230
+ device = self.device
231
+ im0 = ToTensor()(im0)[None].to(device)
232
+ im1 = ToTensor()(im1)[None].to(device)
233
+
234
+ B,C,H0,W0 = im0.shape
235
+ B,C,H1,W1 = im1.shape
236
+ self.train(False)
237
+ corresps = self.forward({"im_A":im0, "im_B":im1})
238
+ #return 1,1
239
+ flow = F.interpolate(
240
+ corresps[4]["flow"],
241
+ size = (H0, W0),
242
+ mode = "bilinear", align_corners = False).permute(0,2,3,1).reshape(B,H0,W0,2)
243
+ grid = torch.stack(
244
+ torch.meshgrid(
245
+ torch.linspace(-1+1/W0,1-1/W0, W0),
246
+ torch.linspace(-1+1/H0,1-1/H0, H0),
247
+ indexing = "xy"),
248
+ dim = -1).float().to(flow.device).expand(B, H0, W0, 2)
249
+
250
+ certainty = F.interpolate(corresps[4]["certainty"], size = (H0,W0), mode = "bilinear", align_corners = False)
251
+ warp, cert = torch.cat((grid, flow), dim = -1), certainty[:,0].sigmoid()
252
+ if batched:
253
+ return warp, cert
254
+ else:
255
+ return warp[0], cert[0]
256
+
257
+ def sample(
258
+ self,
259
+ matches,
260
+ certainty,
261
+ num=10000,
262
+ ):
263
+ if "threshold" in self.sample_mode:
264
+ upper_thresh = self.sample_thresh
265
+ certainty = certainty.clone()
266
+ certainty[certainty > upper_thresh] = 1
267
+ matches, certainty = (
268
+ matches.reshape(-1, 4),
269
+ certainty.reshape(-1),
270
+ )
271
+ expansion_factor = 4 if "balanced" in self.sample_mode else 1
272
+ good_samples = torch.multinomial(certainty,
273
+ num_samples = min(expansion_factor*num, len(certainty)),
274
+ replacement=False)
275
+ good_matches, good_certainty = matches[good_samples], certainty[good_samples]
276
+ if "balanced" not in self.sample_mode:
277
+ return good_matches, good_certainty
278
+ density = kde(good_matches, std=0.1)
279
+ p = 1 / (density+1)
280
+ p[density < 10] = 1e-7 # Basically should have at least 10 perfect neighbours, or around 100 ok ones
281
+ balanced_samples = torch.multinomial(p,
282
+ num_samples = min(num,len(good_certainty)),
283
+ replacement=False)
284
+ return good_matches[balanced_samples], good_certainty[balanced_samples]
285
+
286
+ def forward(self, batch):
287
+ """
288
+ input:
289
+ x -> torch.Tensor(B, C, H, W) grayscale or rgb images
290
+ return:
291
+
292
+ """
293
+ im0 = batch["im_A"]
294
+ im1 = batch["im_B"]
295
+ corresps = {}
296
+ im0, rh0, rw0 = self.preprocess_tensor(im0)
297
+ im1, rh1, rw1 = self.preprocess_tensor(im1)
298
+ B, C, H0, W0 = im0.shape
299
+ B, C, H1, W1 = im1.shape
300
+ to_normalized = torch.tensor((2/W1, 2/H1, 1)).to(im0.device)[None,:,None,None]
301
+
302
+ if im0.shape[-2:] == im1.shape[-2:]:
303
+ x = torch.cat([im0, im1], dim=0)
304
+ x = self.forward_single(x)
305
+ feats_x0_c, feats_x1_c = x[1].chunk(2)
306
+ feats_x0_f, feats_x1_f = x[0].chunk(2)
307
+ else:
308
+ feats_x0_f, feats_x0_c = self.forward_single(im0)
309
+ feats_x1_f, feats_x1_c = self.forward_single(im1)
310
+ corr_volume = self.corr_volume(feats_x0_c, feats_x1_c)
311
+ coarse_warp = self.pos_embed(corr_volume)
312
+ coarse_matches = torch.cat((coarse_warp, torch.zeros_like(coarse_warp[:,-1:])), dim=1)
313
+ feats_x1_c_warped = F.grid_sample(feats_x1_c, coarse_matches.permute(0, 2, 3, 1)[...,:2], mode = 'bilinear', align_corners = False)
314
+ coarse_matches_delta = self.coarse_matcher(torch.cat((feats_x0_c, feats_x1_c_warped, coarse_warp), dim=1))
315
+ coarse_matches = coarse_matches + coarse_matches_delta * to_normalized
316
+ corresps[8] = {"flow": coarse_matches[:,:2], "certainty": coarse_matches[:,2:]}
317
+ coarse_matches_up = F.interpolate(coarse_matches, size = feats_x0_f.shape[-2:], mode = "bilinear", align_corners = False)
318
+ coarse_matches_up_detach = coarse_matches_up.detach()#note the detach
319
+ feats_x1_f_warped = F.grid_sample(feats_x1_f, coarse_matches_up_detach.permute(0, 2, 3, 1)[...,:2], mode = 'bilinear', align_corners = False)
320
+ fine_matches_delta = self.fine_matcher(torch.cat((feats_x0_f, feats_x1_f_warped, coarse_matches_up_detach[:,:2]), dim=1))
321
+ fine_matches = coarse_matches_up_detach+fine_matches_delta * to_normalized
322
+ corresps[4] = {"flow": fine_matches[:,:2], "certainty": fine_matches[:,2:]}
323
+ return corresps
324
+
325
+
326
+
327
+
328
+
329
+ def train(args):
330
+ rank = 0
331
+ gpus = 1
332
+ device_id = rank % torch.cuda.device_count()
333
+ romatch.LOCAL_RANK = 0
334
+ torch.cuda.set_device(device_id)
335
+
336
+ resolution = "big"
337
+ wandb_log = not args.dont_log_wandb
338
+ experiment_name = Path(__file__).stem
339
+ wandb_mode = "online" if wandb_log and rank == 0 else "disabled"
340
+ wandb.init(project="romatch", entity=args.wandb_entity, name=experiment_name, reinit=False, mode = wandb_mode)
341
+ checkpoint_dir = "workspace/checkpoints/"
342
+ h,w = resolutions[resolution]
343
+ model = XFeatModel(freeze_xfeat = False).to(device_id)
344
+ # Num steps
345
+ global_step = 0
346
+ batch_size = args.gpu_batch_size
347
+ step_size = gpus*batch_size
348
+ romatch.STEP_SIZE = step_size
349
+
350
+ N = 2_000_000 # 2M pairs
351
+ # checkpoint every
352
+ k = 25000 // romatch.STEP_SIZE
353
+
354
+ # Data
355
+ mega = MegadepthBuilder(data_root="data/megadepth", loftr_ignore=True, imc21_ignore = True)
356
+ use_horizontal_flip_aug = True
357
+ normalize = False # do not apply ImageNet normalization
358
+ rot_prob = 0
359
+ depth_interpolation_mode = "bilinear"
360
+ megadepth_train1 = mega.build_scenes(
361
+ split="train_loftr", min_overlap=0.01, shake_t=32, use_horizontal_flip_aug = use_horizontal_flip_aug, rot_prob = rot_prob,
362
+ ht=h,wt=w, normalize = normalize
363
+ )
364
+ megadepth_train2 = mega.build_scenes(
365
+ split="train_loftr", min_overlap=0.35, shake_t=32, use_horizontal_flip_aug = use_horizontal_flip_aug, rot_prob = rot_prob,
366
+ ht=h,wt=w, normalize = normalize
367
+ )
368
+ megadepth_train = ConcatDataset(megadepth_train1 + megadepth_train2)
369
+ mega_ws = mega.weight_scenes(megadepth_train, alpha=0.75)
370
+ # Loss and optimizer
371
+ depth_loss = RobustLosses(
372
+ ce_weight=0.01,
373
+ local_dist={4:4},
374
+ depth_interpolation_mode=depth_interpolation_mode,
375
+ alpha = {4:0.15, 8:0.15},
376
+ c = 1e-4,
377
+ epe_mask_prob_th = 0.001,
378
+ )
379
+ parameters = [
380
+ {"params": model.parameters(), "lr": romatch.STEP_SIZE * 1e-4 / 8},
381
+ ]
382
+ optimizer = torch.optim.AdamW(parameters, weight_decay=0.01)
383
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
384
+ optimizer, milestones=[(9*N/romatch.STEP_SIZE)//10])
385
+ #megadense_benchmark = MegadepthDenseBenchmark("data/megadepth", num_samples = 1000, h=h,w=w)
386
+ mega1500_benchmark = Mega1500PoseLibBenchmark("data/megadepth", num_ransac_iter = 1, test_every = 30)
387
+
388
+ checkpointer = CheckPoint(checkpoint_dir, experiment_name)
389
+ model, optimizer, lr_scheduler, global_step = checkpointer.load(model, optimizer, lr_scheduler, global_step)
390
+ romatch.GLOBAL_STEP = global_step
391
+ grad_scaler = torch.cuda.amp.GradScaler(growth_interval=1_000_000)
392
+ grad_clip_norm = 0.01
393
+ #megadense_benchmark.benchmark(model)
394
+ for n in range(romatch.GLOBAL_STEP, N, k * romatch.STEP_SIZE):
395
+ mega_sampler = torch.utils.data.WeightedRandomSampler(
396
+ mega_ws, num_samples = batch_size * k, replacement=False
397
+ )
398
+ mega_dataloader = iter(
399
+ torch.utils.data.DataLoader(
400
+ megadepth_train,
401
+ batch_size = batch_size,
402
+ sampler = mega_sampler,
403
+ num_workers = 8,
404
+ )
405
+ )
406
+ train_k_steps(
407
+ n, k, mega_dataloader, model, depth_loss, optimizer, lr_scheduler, grad_scaler, grad_clip_norm = grad_clip_norm,
408
+ )
409
+ checkpointer.save(model, optimizer, lr_scheduler, romatch.GLOBAL_STEP)
410
+ wandb.log(mega1500_benchmark.benchmark(model, model_name=experiment_name), step = romatch.GLOBAL_STEP)
411
+
412
+ def test_mega_8_scenes(model, name):
413
+ mega_8_scenes_benchmark = MegaDepthPoseEstimationBenchmark("data/megadepth",
414
+ scene_names=['mega_8_scenes_0019_0.1_0.3.npz',
415
+ 'mega_8_scenes_0025_0.1_0.3.npz',
416
+ 'mega_8_scenes_0021_0.1_0.3.npz',
417
+ 'mega_8_scenes_0008_0.1_0.3.npz',
418
+ 'mega_8_scenes_0032_0.1_0.3.npz',
419
+ 'mega_8_scenes_1589_0.1_0.3.npz',
420
+ 'mega_8_scenes_0063_0.1_0.3.npz',
421
+ 'mega_8_scenes_0024_0.1_0.3.npz',
422
+ 'mega_8_scenes_0019_0.3_0.5.npz',
423
+ 'mega_8_scenes_0025_0.3_0.5.npz',
424
+ 'mega_8_scenes_0021_0.3_0.5.npz',
425
+ 'mega_8_scenes_0008_0.3_0.5.npz',
426
+ 'mega_8_scenes_0032_0.3_0.5.npz',
427
+ 'mega_8_scenes_1589_0.3_0.5.npz',
428
+ 'mega_8_scenes_0063_0.3_0.5.npz',
429
+ 'mega_8_scenes_0024_0.3_0.5.npz'])
430
+ mega_8_scenes_results = mega_8_scenes_benchmark.benchmark(model, model_name=name)
431
+ print(mega_8_scenes_results)
432
+ json.dump(mega_8_scenes_results, open(f"results/mega_8_scenes_{name}.json", "w"))
433
+
434
+ def test_mega1500(model, name):
435
+ mega1500_benchmark = MegaDepthPoseEstimationBenchmark("data/megadepth")
436
+ mega1500_results = mega1500_benchmark.benchmark(model, model_name=name)
437
+ json.dump(mega1500_results, open(f"results/mega1500_{name}.json", "w"))
438
+
439
+ def test_mega1500_poselib(model, name):
440
+ mega1500_benchmark = Mega1500PoseLibBenchmark("data/megadepth", num_ransac_iter = 1, test_every = 1)
441
+ mega1500_results = mega1500_benchmark.benchmark(model, model_name=name)
442
+ json.dump(mega1500_results, open(f"results/mega1500_poselib_{name}.json", "w"))
443
+
444
+ def test_mega_8_scenes_poselib(model, name):
445
+ mega1500_benchmark = Mega1500PoseLibBenchmark("data/megadepth", num_ransac_iter = 1, test_every = 1,
446
+ scene_names=['mega_8_scenes_0019_0.1_0.3.npz',
447
+ 'mega_8_scenes_0025_0.1_0.3.npz',
448
+ 'mega_8_scenes_0021_0.1_0.3.npz',
449
+ 'mega_8_scenes_0008_0.1_0.3.npz',
450
+ 'mega_8_scenes_0032_0.1_0.3.npz',
451
+ 'mega_8_scenes_1589_0.1_0.3.npz',
452
+ 'mega_8_scenes_0063_0.1_0.3.npz',
453
+ 'mega_8_scenes_0024_0.1_0.3.npz',
454
+ 'mega_8_scenes_0019_0.3_0.5.npz',
455
+ 'mega_8_scenes_0025_0.3_0.5.npz',
456
+ 'mega_8_scenes_0021_0.3_0.5.npz',
457
+ 'mega_8_scenes_0008_0.3_0.5.npz',
458
+ 'mega_8_scenes_0032_0.3_0.5.npz',
459
+ 'mega_8_scenes_1589_0.3_0.5.npz',
460
+ 'mega_8_scenes_0063_0.3_0.5.npz',
461
+ 'mega_8_scenes_0024_0.3_0.5.npz'])
462
+ mega1500_results = mega1500_benchmark.benchmark(model, model_name=name)
463
+ json.dump(mega1500_results, open(f"results/mega_8_scenes_poselib_{name}.json", "w"))
464
+
465
+ def test_scannet_poselib(model, name):
466
+ scannet_benchmark = ScanNetPoselibBenchmark("data/scannet")
467
+ scannet_results = scannet_benchmark.benchmark(model)
468
+ json.dump(scannet_results, open(f"results/scannet_{name}.json", "w"))
469
+
470
+ def test_scannet(model, name):
471
+ scannet_benchmark = ScanNetBenchmark("data/scannet")
472
+ scannet_results = scannet_benchmark.benchmark(model)
473
+ json.dump(scannet_results, open(f"results/scannet_{name}.json", "w"))
474
+
475
+ if __name__ == "__main__":
476
+ os.environ["TORCH_CUDNN_V8_API_ENABLED"] = "1" # For BF16 computations
477
+ os.environ["OMP_NUM_THREADS"] = "16"
478
+ torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
479
+ import romatch
480
+ parser = ArgumentParser()
481
+ parser.add_argument("--only_test", action='store_true')
482
+ parser.add_argument("--debug_mode", action='store_true')
483
+ parser.add_argument("--dont_log_wandb", action='store_true')
484
+ parser.add_argument("--train_resolution", default='medium')
485
+ parser.add_argument("--gpu_batch_size", default=8, type=int)
486
+ parser.add_argument("--wandb_entity", required = False)
487
+
488
+ args, _ = parser.parse_known_args()
489
+ romatch.DEBUG_MODE = args.debug_mode
490
+ if not args.only_test:
491
+ train(args)
492
+
493
+ experiment_name = "tiny_roma_v1_outdoor"#Path(__file__).stem
494
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
495
+ model = XFeatModel(freeze_xfeat=False, exact_softmax=False).to(device)
496
+ model.load_state_dict(torch.load(f"{experiment_name}.pth"))
497
+ test_mega1500_poselib(model, experiment_name)
498
+
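
The tiny model above can be exercised outside the training loop through the methods it defines (match_from_path, sample, to_pixel_coordinates). A minimal inference sketch, with placeholder image paths and the checkpoint name used in the __main__ block:

    import torch
    from PIL import Image

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = XFeatModel(freeze_xfeat=False, exact_softmax=False).to(device)
    model.load_state_dict(torch.load("tiny_roma_v1_outdoor.pth", map_location=device))
    model.eval()

    # Dense warp (H, W, 4) and per-pixel certainty (H, W), mapping image A onto image B.
    warp, certainty = model.match_from_path("im_A.jpg", "im_B.jpg")  # placeholder paths

    # Sample sparse correspondences and convert them to pixel coordinates.
    matches, confidence = model.sample(warp, certainty, num=2000)
    W_A, H_A = Image.open("im_A.jpg").size
    W_B, H_B = Image.open("im_B.jpg").size
    kpts_A, kpts_B = model.to_pixel_coordinates(matches, H_A, W_A, H_B, W_B)
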
submodules/RoMa/requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ torch
2
+ einops
3
+ torchvision
4
+ opencv-python
5
+ kornia
6
+ albumentations
7
+ loguru
8
+ tqdm
9
+ matplotlib
10
+ h5py
11
+ wandb
12
+ timm
13
+ poselib
14
+ #xformers # Optional, used for memory-efficient attention
submodules/RoMa/romatch/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ import os
2
+ from .models import roma_outdoor, tiny_roma_v1_outdoor, roma_indoor
3
+
4
+ DEBUG_MODE = False
5
+ RANK = int(os.environ.get('RANK', default = 0))
6
+ GLOBAL_STEP = 0
7
+ STEP_SIZE = 1
8
+ LOCAL_RANK = -1
submodules/RoMa/romatch/benchmarks/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .hpatches_sequences_homog_benchmark import HpatchesHomogBenchmark
2
+ from .scannet_benchmark import ScanNetBenchmark
3
+ from .megadepth_pose_estimation_benchmark import MegaDepthPoseEstimationBenchmark
4
+ from .megadepth_dense_benchmark import MegadepthDenseBenchmark
5
+ from .megadepth_pose_estimation_benchmark_poselib import Mega1500PoseLibBenchmark
6
+ #from .scannet_benchmark_poselib import ScanNetPoselibBenchmark
submodules/RoMa/romatch/benchmarks/hpatches_sequences_homog_benchmark.py ADDED
@@ -0,0 +1,113 @@
1
+ from PIL import Image
2
+ import numpy as np
3
+
4
+ import os
5
+
6
+ from tqdm import tqdm
7
+ from romatch.utils import pose_auc
8
+ import cv2
9
+
10
+
11
+ class HpatchesHomogBenchmark:
12
+ """Hpatches grid goes from [0,n-1] instead of [0.5,n-0.5]"""
13
+
14
+ def __init__(self, dataset_path) -> None:
15
+ seqs_dir = "hpatches-sequences-release"
16
+ self.seqs_path = os.path.join(dataset_path, seqs_dir)
17
+ self.seq_names = sorted(os.listdir(self.seqs_path))
18
+ # Ignored sequences are the same as in LoFTR.
19
+ self.ignore_seqs = set(
20
+ [
21
+ "i_contruction",
22
+ "i_crownnight",
23
+ "i_dc",
24
+ "i_pencils",
25
+ "i_whitebuilding",
26
+ "v_artisans",
27
+ "v_astronautis",
28
+ "v_talent",
29
+ ]
30
+ )
31
+
32
+ def convert_coordinates(self, im_A_coords, im_A_to_im_B, wq, hq, wsup, hsup):
33
+ offset = 0.5 # Hpatches assumes that the center of the top-left pixel is at [0,0] (I think)
34
+ im_A_coords = (
35
+ np.stack(
36
+ (
37
+ wq * (im_A_coords[..., 0] + 1) / 2,
38
+ hq * (im_A_coords[..., 1] + 1) / 2,
39
+ ),
40
+ axis=-1,
41
+ )
42
+ - offset
43
+ )
44
+ im_A_to_im_B = (
45
+ np.stack(
46
+ (
47
+ wsup * (im_A_to_im_B[..., 0] + 1) / 2,
48
+ hsup * (im_A_to_im_B[..., 1] + 1) / 2,
49
+ ),
50
+ axis=-1,
51
+ )
52
+ - offset
53
+ )
54
+ return im_A_coords, im_A_to_im_B
55
+
56
+ def benchmark(self, model, model_name = None):
57
+ n_matches = []
58
+ homog_dists = []
59
+ for seq_idx, seq_name in tqdm(
60
+ enumerate(self.seq_names), total=len(self.seq_names)
61
+ ):
62
+ im_A_path = os.path.join(self.seqs_path, seq_name, "1.ppm")
63
+ im_A = Image.open(im_A_path)
64
+ w1, h1 = im_A.size
65
+ for im_idx in range(2, 7):
66
+ im_B_path = os.path.join(self.seqs_path, seq_name, f"{im_idx}.ppm")
67
+ im_B = Image.open(im_B_path)
68
+ w2, h2 = im_B.size
69
+ H = np.loadtxt(
70
+ os.path.join(self.seqs_path, seq_name, "H_1_" + str(im_idx))
71
+ )
72
+ dense_matches, dense_certainty = model.match(
73
+ im_A_path, im_B_path
74
+ )
75
+ good_matches, _ = model.sample(dense_matches, dense_certainty, 5000)
76
+ pos_a, pos_b = self.convert_coordinates(
77
+ good_matches[:, :2], good_matches[:, 2:], w1, h1, w2, h2
78
+ )
79
+ try:
80
+ H_pred, inliers = cv2.findHomography(
81
+ pos_a,
82
+ pos_b,
83
+ method = cv2.RANSAC,
84
+ confidence = 0.99999,
85
+ ransacReprojThreshold = 3 * min(w2, h2) / 480,
86
+ )
87
+ except Exception:
88
+ H_pred = None
89
+ if H_pred is None:
90
+ H_pred = np.zeros((3, 3))
91
+ H_pred[2, 2] = 1.0
92
+ corners = np.array(
93
+ [[0, 0, 1], [0, h1 - 1, 1], [w1 - 1, 0, 1], [w1 - 1, h1 - 1, 1]]
94
+ )
95
+ real_warped_corners = np.dot(corners, np.transpose(H))
96
+ real_warped_corners = (
97
+ real_warped_corners[:, :2] / real_warped_corners[:, 2:]
98
+ )
99
+ warped_corners = np.dot(corners, np.transpose(H_pred))
100
+ warped_corners = warped_corners[:, :2] / warped_corners[:, 2:]
101
+ mean_dist = np.mean(
102
+ np.linalg.norm(real_warped_corners - warped_corners, axis=1)
103
+ ) / (min(w2, h2) / 480.0)
104
+ homog_dists.append(mean_dist)
105
+
106
+ n_matches = np.array(n_matches)
107
+ thresholds = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
108
+ auc = pose_auc(np.array(homog_dists), thresholds)
109
+ return {
110
+ "hpatches_homog_auc_3": auc[2],
111
+ "hpatches_homog_auc_5": auc[4],
112
+ "hpatches_homog_auc_10": auc[9],
113
+ }
submodules/RoMa/romatch/benchmarks/megadepth_dense_benchmark.py ADDED
@@ -0,0 +1,106 @@
1
+ import torch
2
+ import numpy as np
3
+ import tqdm
4
+ from romatch.datasets import MegadepthBuilder
5
+ from romatch.utils import warp_kpts
6
+ from torch.utils.data import ConcatDataset
7
+ import romatch
8
+
9
+ class MegadepthDenseBenchmark:
10
+ def __init__(self, data_root="data/megadepth", h = 384, w = 512, num_samples = 2000) -> None:
11
+ mega = MegadepthBuilder(data_root=data_root)
12
+ self.dataset = ConcatDataset(
13
+ mega.build_scenes(split="test_loftr", ht=h, wt=w)
14
+ ) # fixed resolution of 384,512
15
+ self.num_samples = num_samples
16
+
17
+ def geometric_dist(self, depth1, depth2, T_1to2, K1, K2, dense_matches):
18
+ b, h1, w1, d = dense_matches.shape
19
+ with torch.no_grad():
20
+ x1 = dense_matches[..., :2].reshape(b, h1 * w1, 2)
21
+ mask, x2 = warp_kpts(
22
+ x1.double(),
23
+ depth1.double(),
24
+ depth2.double(),
25
+ T_1to2.double(),
26
+ K1.double(),
27
+ K2.double(),
28
+ )
29
+ x2 = torch.stack(
30
+ (w1 * (x2[..., 0] + 1) / 2, h1 * (x2[..., 1] + 1) / 2), dim=-1
31
+ )
32
+ prob = mask.float().reshape(b, h1, w1)
33
+ x2_hat = dense_matches[..., 2:]
34
+ x2_hat = torch.stack(
35
+ (w1 * (x2_hat[..., 0] + 1) / 2, h1 * (x2_hat[..., 1] + 1) / 2), dim=-1
36
+ )
37
+ gd = (x2_hat - x2.reshape(b, h1, w1, 2)).norm(dim=-1)
38
+ gd = gd[prob == 1]
39
+ pck_1 = (gd < 1.0).float().mean()
40
+ pck_3 = (gd < 3.0).float().mean()
41
+ pck_5 = (gd < 5.0).float().mean()
42
+ return gd, pck_1, pck_3, pck_5, prob
43
+
44
+ def benchmark(self, model, batch_size=8):
45
+ model.train(False)
46
+ with torch.no_grad():
47
+ gd_tot = 0.0
48
+ pck_1_tot = 0.0
49
+ pck_3_tot = 0.0
50
+ pck_5_tot = 0.0
51
+ sampler = torch.utils.data.WeightedRandomSampler(
52
+ torch.ones(len(self.dataset)), replacement=False, num_samples=self.num_samples
53
+ )
54
+ B = batch_size
55
+ dataloader = torch.utils.data.DataLoader(
56
+ self.dataset, batch_size=B, num_workers=batch_size, sampler=sampler
57
+ )
58
+ for idx, data in tqdm.tqdm(enumerate(dataloader), disable = romatch.RANK > 0):
59
+ im_A, im_B, depth1, depth2, T_1to2, K1, K2 = (
60
+ data["im_A"].cuda(),
61
+ data["im_B"].cuda(),
62
+ data["im_A_depth"].cuda(),
63
+ data["im_B_depth"].cuda(),
64
+ data["T_1to2"].cuda(),
65
+ data["K1"].cuda(),
66
+ data["K2"].cuda(),
67
+ )
68
+ matches, certainty = model.match(im_A, im_B, batched=True)
69
+ gd, pck_1, pck_3, pck_5, prob = self.geometric_dist(
70
+ depth1, depth2, T_1to2, K1, K2, matches
71
+ )
72
+ if romatch.DEBUG_MODE:
73
+ from romatch.utils.utils import tensor_to_pil
74
+ import torch.nn.functional as F
75
+ path = "vis"
76
+ H, W = model.get_output_resolution()
77
+ white_im = torch.ones((B,1,H,W),device="cuda")
78
+ im_B_transfer_rgb = F.grid_sample(
79
+ im_B.cuda(), matches[:,:,:W, 2:], mode="bilinear", align_corners=False
80
+ )
81
+ warp_im = im_B_transfer_rgb
82
+ c_b = certainty[:,None]#(certainty*0.9 + 0.1*torch.ones_like(certainty))[:,None]
83
+ vis_im = c_b * warp_im + (1 - c_b) * white_im
84
+ for b in range(B):
85
+ import os
86
+ os.makedirs(f"{path}/{model.name}/{idx}_{b}_{H}_{W}",exist_ok=True)
87
+ tensor_to_pil(vis_im[b], unnormalize=True).save(
88
+ f"{path}/{model.name}/{idx}_{b}_{H}_{W}/warp.jpg")
89
+ tensor_to_pil(im_A[b].cuda(), unnormalize=True).save(
90
+ f"{path}/{model.name}/{idx}_{b}_{H}_{W}/im_A.jpg")
91
+ tensor_to_pil(im_B[b].cuda(), unnormalize=True).save(
92
+ f"{path}/{model.name}/{idx}_{b}_{H}_{W}/im_B.jpg")
93
+
94
+
95
+ gd_tot, pck_1_tot, pck_3_tot, pck_5_tot = (
96
+ gd_tot + gd.mean(),
97
+ pck_1_tot + pck_1,
98
+ pck_3_tot + pck_3,
99
+ pck_5_tot + pck_5,
100
+ )
101
+ return {
102
+ "epe": gd_tot.item() / len(dataloader),
103
+ "mega_pck_1": pck_1_tot.item() / len(dataloader),
104
+ "mega_pck_3": pck_3_tot.item() / len(dataloader),
105
+ "mega_pck_5": pck_5_tot.item() / len(dataloader),
106
+ }
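
The PCK values computed by geometric_dist above are simply the fraction of valid ground-truth correspondences whose predicted match lands within 1, 3 or 5 pixels; a tiny sketch with made-up end-point errors:

    import torch

    gd = torch.tensor([0.4, 2.1, 0.9, 6.3, 3.8])  # hypothetical end-point errors in pixels
    pck_1 = (gd < 1.0).float().mean()  # 0.40
    pck_3 = (gd < 3.0).float().mean()  # 0.60
    pck_5 = (gd < 5.0).float().mean()  # 0.80
    print(pck_1.item(), pck_3.item(), pck_5.item())
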
submodules/RoMa/romatch/benchmarks/megadepth_pose_estimation_benchmark.py ADDED
@@ -0,0 +1,118 @@
1
+ import numpy as np
2
+ import torch
3
+ from romatch.utils import *
4
+ from PIL import Image
5
+ from tqdm import tqdm
6
+ import torch.nn.functional as F
7
+ import romatch
8
+ import kornia.geometry.epipolar as kepi
9
+
10
+ class MegaDepthPoseEstimationBenchmark:
11
+ def __init__(self, data_root="data/megadepth", scene_names = None) -> None:
12
+ if scene_names is None:
13
+ self.scene_names = [
14
+ "0015_0.1_0.3.npz",
15
+ "0015_0.3_0.5.npz",
16
+ "0022_0.1_0.3.npz",
17
+ "0022_0.3_0.5.npz",
18
+ "0022_0.5_0.7.npz",
19
+ ]
20
+ else:
21
+ self.scene_names = scene_names
22
+ self.scenes = [
23
+ np.load(f"{data_root}/{scene}", allow_pickle=True)
24
+ for scene in self.scene_names
25
+ ]
26
+ self.data_root = data_root
27
+
28
+ def benchmark(self, model, model_name = None):
29
+ with torch.no_grad():
30
+ data_root = self.data_root
31
+ tot_e_t, tot_e_R, tot_e_pose = [], [], []
32
+ thresholds = [5, 10, 20]
33
+ for scene_ind in range(len(self.scenes)):
34
+ import os
35
+ scene_name = os.path.splitext(self.scene_names[scene_ind])[0]
36
+ scene = self.scenes[scene_ind]
37
+ pairs = scene["pair_infos"]
38
+ intrinsics = scene["intrinsics"]
39
+ poses = scene["poses"]
40
+ im_paths = scene["image_paths"]
41
+ pair_inds = range(len(pairs))
42
+ for pairind in tqdm(pair_inds):
43
+ idx1, idx2 = pairs[pairind][0]
44
+ K1 = intrinsics[idx1].copy()
45
+ T1 = poses[idx1].copy()
46
+ R1, t1 = T1[:3, :3], T1[:3, 3]
47
+ K2 = intrinsics[idx2].copy()
48
+ T2 = poses[idx2].copy()
49
+ R2, t2 = T2[:3, :3], T2[:3, 3]
50
+ R, t = compute_relative_pose(R1, t1, R2, t2)
51
+ T1_to_2 = np.concatenate((R,t[:,None]), axis=-1)
52
+ im_A_path = f"{data_root}/{im_paths[idx1]}"
53
+ im_B_path = f"{data_root}/{im_paths[idx2]}"
54
+ dense_matches, dense_certainty = model.match(
55
+ im_A_path, im_B_path, K1.copy(), K2.copy(), T1_to_2.copy()
56
+ )
57
+ sparse_matches,_ = model.sample(
58
+ dense_matches, dense_certainty, 5_000
59
+ )
60
+
61
+ im_A = Image.open(im_A_path)
62
+ w1, h1 = im_A.size
63
+ im_B = Image.open(im_B_path)
64
+ w2, h2 = im_B.size
65
+ if True: # Note: we keep this true as it was used in DKM/RoMa papers. There is very little difference compared to setting to False.
66
+ scale1 = 1200 / max(w1, h1)
67
+ scale2 = 1200 / max(w2, h2)
68
+ w1, h1 = scale1 * w1, scale1 * h1
69
+ w2, h2 = scale2 * w2, scale2 * h2
70
+ K1, K2 = K1.copy(), K2.copy()
71
+ K1[:2] = K1[:2] * scale1
72
+ K2[:2] = K2[:2] * scale2
73
+
74
+ kpts1, kpts2 = model.to_pixel_coordinates(sparse_matches, h1, w1, h2, w2)
75
+ kpts1, kpts2 = kpts1.cpu().numpy(), kpts2.cpu().numpy()
76
+ for _ in range(5):
77
+ shuffling = np.random.permutation(np.arange(len(kpts1)))
78
+ kpts1 = kpts1[shuffling]
79
+ kpts2 = kpts2[shuffling]
80
+ try:
81
+ threshold = 0.5
82
+ norm_threshold = threshold / (np.mean(np.abs(K1[:2, :2])) + np.mean(np.abs(K2[:2, :2])))
83
+ R_est, t_est, mask = estimate_pose(
84
+ kpts1,
85
+ kpts2,
86
+ K1,
87
+ K2,
88
+ norm_threshold,
89
+ conf=0.99999,
90
+ )
91
+ T1_to_2_est = np.concatenate((R_est, t_est), axis=-1) #
92
+ e_t, e_R = compute_pose_error(T1_to_2_est, R, t)
93
+ e_pose = max(e_t, e_R)
94
+ except Exception as e:
95
+ print(repr(e))
96
+ e_t, e_R = 90, 90
97
+ e_pose = max(e_t, e_R)
98
+ tot_e_t.append(e_t)
99
+ tot_e_R.append(e_R)
100
+ tot_e_pose.append(e_pose)
101
+ tot_e_pose = np.array(tot_e_pose)
102
+ auc = pose_auc(tot_e_pose, thresholds)
103
+ acc_5 = (tot_e_pose < 5).mean()
104
+ acc_10 = (tot_e_pose < 10).mean()
105
+ acc_15 = (tot_e_pose < 15).mean()
106
+ acc_20 = (tot_e_pose < 20).mean()
107
+ map_5 = acc_5
108
+ map_10 = np.mean([acc_5, acc_10])
109
+ map_20 = np.mean([acc_5, acc_10, acc_15, acc_20])
110
+ print(f"{model_name} auc: {auc}")
111
+ return {
112
+ "auc_5": auc[0],
113
+ "auc_10": auc[1],
114
+ "auc_20": auc[2],
115
+ "map_5": map_5,
116
+ "map_10": map_10,
117
+ "map_20": map_20,
118
+ }
submodules/RoMa/romatch/benchmarks/megadepth_pose_estimation_benchmark_poselib.py ADDED
@@ -0,0 +1,119 @@
1
+ import numpy as np
2
+ import torch
3
+ from romatch.utils import *
4
+ from PIL import Image
5
+ from tqdm import tqdm
6
+ import torch.nn.functional as F
7
+ import romatch
8
+ import kornia.geometry.epipolar as kepi
9
+
10
+ # wrapped because poselib is still in development
11
+ # it will be added to the dependencies later
12
+ import poselib
13
+
14
+ class Mega1500PoseLibBenchmark:
15
+ def __init__(self, data_root="data/megadepth", scene_names = None, num_ransac_iter = 5, test_every = 1) -> None:
16
+ if scene_names is None:
17
+ self.scene_names = [
18
+ "0015_0.1_0.3.npz",
19
+ "0015_0.3_0.5.npz",
20
+ "0022_0.1_0.3.npz",
21
+ "0022_0.3_0.5.npz",
22
+ "0022_0.5_0.7.npz",
23
+ ]
24
+ else:
25
+ self.scene_names = scene_names
26
+ self.scenes = [
27
+ np.load(f"{data_root}/{scene}", allow_pickle=True)
28
+ for scene in self.scene_names
29
+ ]
30
+ self.data_root = data_root
31
+ self.num_ransac_iter = num_ransac_iter
32
+ self.test_every = test_every
33
+
34
+ def benchmark(self, model, model_name = None):
35
+ with torch.no_grad():
36
+ data_root = self.data_root
37
+ tot_e_t, tot_e_R, tot_e_pose = [], [], []
38
+ thresholds = [5, 10, 20]
39
+ for scene_ind in range(len(self.scenes)):
40
+ import os
41
+ scene_name = os.path.splitext(self.scene_names[scene_ind])[0]
42
+ scene = self.scenes[scene_ind]
43
+ pairs = scene["pair_infos"]
44
+ intrinsics = scene["intrinsics"]
45
+ poses = scene["poses"]
46
+ im_paths = scene["image_paths"]
47
+ pair_inds = range(len(pairs))[::self.test_every]
48
+ for pairind in (pbar := tqdm(pair_inds, desc = "Current AUC: ?")):
49
+ idx1, idx2 = pairs[pairind][0]
50
+ K1 = intrinsics[idx1].copy()
51
+ T1 = poses[idx1].copy()
52
+ R1, t1 = T1[:3, :3], T1[:3, 3]
53
+ K2 = intrinsics[idx2].copy()
54
+ T2 = poses[idx2].copy()
55
+ R2, t2 = T2[:3, :3], T2[:3, 3]
56
+ R, t = compute_relative_pose(R1, t1, R2, t2)
57
+ T1_to_2 = np.concatenate((R,t[:,None]), axis=-1)
58
+ im_A_path = f"{data_root}/{im_paths[idx1]}"
59
+ im_B_path = f"{data_root}/{im_paths[idx2]}"
60
+ dense_matches, dense_certainty = model.match(
61
+ im_A_path, im_B_path, K1.copy(), K2.copy(), T1_to_2.copy()
62
+ )
63
+ sparse_matches,_ = model.sample(
64
+ dense_matches, dense_certainty, 5_000
65
+ )
66
+
67
+ im_A = Image.open(im_A_path)
68
+ w1, h1 = im_A.size
69
+ im_B = Image.open(im_B_path)
70
+ w2, h2 = im_B.size
71
+ kpts1, kpts2 = model.to_pixel_coordinates(sparse_matches, h1, w1, h2, w2)
72
+ kpts1, kpts2 = kpts1.cpu().numpy(), kpts2.cpu().numpy()
73
+ for _ in range(self.num_ransac_iter):
74
+ shuffling = np.random.permutation(np.arange(len(kpts1)))
75
+ kpts1 = kpts1[shuffling]
76
+ kpts2 = kpts2[shuffling]
77
+ try:
78
+ threshold = 1
79
+ camera1 = {'model': 'PINHOLE', 'width': w1, 'height': h1, 'params': K1[[0,1,0,1], [0,1,2,2]]}
80
+ camera2 = {'model': 'PINHOLE', 'width': w2, 'height': h2, 'params': K2[[0,1,0,1], [0,1,2,2]]}
81
+ relpose, res = poselib.estimate_relative_pose(
82
+ kpts1,
83
+ kpts2,
84
+ camera1,
85
+ camera2,
86
+ ransac_opt = {"max_reproj_error": 2*threshold, "max_epipolar_error": threshold, "min_inliers": 8, "max_iterations": 10_000},
87
+ )
88
+ Rt_est = relpose.Rt
89
+ R_est, t_est = Rt_est[:3,:3], Rt_est[:3,3:]
90
+ mask = np.array(res['inliers']).astype(np.float32)
91
+ T1_to_2_est = np.concatenate((R_est, t_est), axis=-1) #
92
+ e_t, e_R = compute_pose_error(T1_to_2_est, R, t)
93
+ e_pose = max(e_t, e_R)
94
+ except Exception as e:
95
+ print(repr(e))
96
+ e_t, e_R = 90, 90
97
+ e_pose = max(e_t, e_R)
98
+ tot_e_t.append(e_t)
99
+ tot_e_R.append(e_R)
100
+ tot_e_pose.append(e_pose)
101
+ pbar.set_description(f"Current AUC: {pose_auc(tot_e_pose, thresholds)}")
102
+ tot_e_pose = np.array(tot_e_pose)
103
+ auc = pose_auc(tot_e_pose, thresholds)
104
+ acc_5 = (tot_e_pose < 5).mean()
105
+ acc_10 = (tot_e_pose < 10).mean()
106
+ acc_15 = (tot_e_pose < 15).mean()
107
+ acc_20 = (tot_e_pose < 20).mean()
108
+ map_5 = acc_5
109
+ map_10 = np.mean([acc_5, acc_10])
110
+ map_20 = np.mean([acc_5, acc_10, acc_15, acc_20])
111
+ print(f"{model_name} auc: {auc}")
112
+ return {
113
+ "auc_5": auc[0],
114
+ "auc_10": auc[1],
115
+ "auc_20": auc[2],
116
+ "map_5": map_5,
117
+ "map_10": map_10,
118
+ "map_20": map_20,
119
+ }
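
The indexing K[[0, 1, 0, 1], [0, 1, 2, 2]] used above packs a 3x3 intrinsic matrix into poselib's PINHOLE parameter order [fx, fy, cx, cy]; a small worked example:

    import numpy as np

    # Example intrinsics: fx=1000, fy=1100, cx=320, cy=240.
    K = np.array([[1000.,    0., 320.],
                  [   0., 1100., 240.],
                  [   0.,    0.,   1.]])

    # Rows [0,1,0,1] with columns [0,1,2,2] select K[0,0], K[1,1], K[0,2], K[1,2].
    params = K[[0, 1, 0, 1], [0, 1, 2, 2]]
    print(params)  # [1000. 1100.  320.  240.]
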
submodules/RoMa/romatch/benchmarks/scannet_benchmark.py ADDED
@@ -0,0 +1,143 @@
1
+ import os.path as osp
2
+ import numpy as np
3
+ import torch
4
+ from romatch.utils import *
5
+ from PIL import Image
6
+ from tqdm import tqdm
7
+
8
+
9
+ class ScanNetBenchmark:
10
+ def __init__(self, data_root="data/scannet") -> None:
11
+ self.data_root = data_root
12
+
13
+ def benchmark(self, model, model_name = None):
14
+ model.train(False)
15
+ with torch.no_grad():
16
+ data_root = self.data_root
17
+ tmp = np.load(osp.join(data_root, "test.npz"))
18
+ pairs, rel_pose = tmp["name"], tmp["rel_pose"]
19
+ tot_e_t, tot_e_R, tot_e_pose = [], [], []
20
+ pair_inds = np.random.choice(
21
+ range(len(pairs)), size=len(pairs), replace=False
22
+ )
23
+ for pairind in tqdm(pair_inds, smoothing=0.9):
24
+ scene = pairs[pairind]
25
+ scene_name = f"scene0{scene[0]}_00"
26
+ im_A_path = osp.join(
27
+ self.data_root,
28
+ "scans_test",
29
+ scene_name,
30
+ "color",
31
+ f"{scene[2]}.jpg",
32
+ )
33
+ im_A = Image.open(im_A_path)
34
+ im_B_path = osp.join(
35
+ self.data_root,
36
+ "scans_test",
37
+ scene_name,
38
+ "color",
39
+ f"{scene[3]}.jpg",
40
+ )
41
+ im_B = Image.open(im_B_path)
42
+ T_gt = rel_pose[pairind].reshape(3, 4)
43
+ R, t = T_gt[:3, :3], T_gt[:3, 3]
44
+ K = np.stack(
45
+ [
46
+ np.array([float(i) for i in r.split()])
47
+ for r in open(
48
+ osp.join(
49
+ self.data_root,
50
+ "scans_test",
51
+ scene_name,
52
+ "intrinsic",
53
+ "intrinsic_color.txt",
54
+ ),
55
+ "r",
56
+ )
57
+ .read()
58
+ .split("\n")
59
+ if r
60
+ ]
61
+ )
62
+ w1, h1 = im_A.size
63
+ w2, h2 = im_B.size
64
+ K1 = K.copy()
65
+ K2 = K.copy()
66
+ dense_matches, dense_certainty = model.match(im_A_path, im_B_path)
67
+ sparse_matches, sparse_certainty = model.sample(
68
+ dense_matches, dense_certainty, 5000
69
+ )
70
+ scale1 = 480 / min(w1, h1)
71
+ scale2 = 480 / min(w2, h2)
72
+ w1, h1 = scale1 * w1, scale1 * h1
73
+ w2, h2 = scale2 * w2, scale2 * h2
74
+ K1 = K1 * scale1
75
+ K2 = K2 * scale2
76
+
77
+ offset = 0.5
78
+ kpts1 = sparse_matches[:, :2]
79
+ kpts1 = (
80
+ np.stack(
81
+ (
82
+ w1 * (kpts1[:, 0] + 1) / 2 - offset,
83
+ h1 * (kpts1[:, 1] + 1) / 2 - offset,
84
+ ),
85
+ axis=-1,
86
+ )
87
+ )
88
+ kpts2 = sparse_matches[:, 2:]
89
+ kpts2 = (
90
+ np.stack(
91
+ (
92
+ w2 * (kpts2[:, 0] + 1) / 2 - offset,
93
+ h2 * (kpts2[:, 1] + 1) / 2 - offset,
94
+ ),
95
+ axis=-1,
96
+ )
97
+ )
98
+ for _ in range(5):
99
+ shuffling = np.random.permutation(np.arange(len(kpts1)))
100
+ kpts1 = kpts1[shuffling]
101
+ kpts2 = kpts2[shuffling]
102
+ try:
103
+ norm_threshold = 0.5 / (
104
+ np.mean(np.abs(K1[:2, :2])) + np.mean(np.abs(K2[:2, :2])))
105
+ R_est, t_est, mask = estimate_pose(
106
+ kpts1,
107
+ kpts2,
108
+ K1,
109
+ K2,
110
+ norm_threshold,
111
+ conf=0.99999,
112
+ )
113
+ T1_to_2_est = np.concatenate((R_est, t_est), axis=-1) #
114
+ e_t, e_R = compute_pose_error(T1_to_2_est, R, t)
115
+ e_pose = max(e_t, e_R)
116
+ except Exception as e:
117
+ print(repr(e))
118
+ e_t, e_R = 90, 90
119
+ e_pose = max(e_t, e_R)
120
+ tot_e_t.append(e_t)
121
+ tot_e_R.append(e_R)
122
+ tot_e_pose.append(e_pose)
123
+ tot_e_t.append(e_t)
124
+ tot_e_R.append(e_R)
125
+ tot_e_pose.append(e_pose)
126
+ tot_e_pose = np.array(tot_e_pose)
127
+ thresholds = [5, 10, 20]
128
+ auc = pose_auc(tot_e_pose, thresholds)
129
+ acc_5 = (tot_e_pose < 5).mean()
130
+ acc_10 = (tot_e_pose < 10).mean()
131
+ acc_15 = (tot_e_pose < 15).mean()
132
+ acc_20 = (tot_e_pose < 20).mean()
133
+ map_5 = acc_5
134
+ map_10 = np.mean([acc_5, acc_10])
135
+ map_20 = np.mean([acc_5, acc_10, acc_15, acc_20])
136
+ return {
137
+ "auc_5": auc[0],
138
+ "auc_10": auc[1],
139
+ "auc_20": auc[2],
140
+ "map_5": map_5,
141
+ "map_10": map_10,
142
+ "map_20": map_20,
143
+ }
submodules/RoMa/romatch/checkpointing/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .checkpoint import CheckPoint
submodules/RoMa/romatch/checkpointing/checkpoint.py ADDED
@@ -0,0 +1,60 @@
1
+ import os
2
+ import torch
3
+ from torch.nn.parallel.data_parallel import DataParallel
4
+ from torch.nn.parallel.distributed import DistributedDataParallel
5
+ from loguru import logger
6
+ import gc
7
+
8
+ import romatch
9
+
10
+ class CheckPoint:
11
+ def __init__(self, dir=None, name="tmp"):
12
+ self.name = name
13
+ self.dir = dir
14
+ os.makedirs(self.dir, exist_ok=True)
15
+
16
+ def save(
17
+ self,
18
+ model,
19
+ optimizer,
20
+ lr_scheduler,
21
+ n,
22
+ ):
23
+ if romatch.RANK == 0:
24
+ assert model is not None
25
+ if isinstance(model, (DataParallel, DistributedDataParallel)):
26
+ model = model.module
27
+ states = {
28
+ "model": model.state_dict(),
29
+ "n": n,
30
+ "optimizer": optimizer.state_dict(),
31
+ "lr_scheduler": lr_scheduler.state_dict(),
32
+ }
33
+ torch.save(states, self.dir + self.name + f"_latest.pth")
34
+ logger.info(f"Saved states {list(states.keys())}, at step {n}")
35
+
36
+ def load(
37
+ self,
38
+ model,
39
+ optimizer,
40
+ lr_scheduler,
41
+ n,
42
+ ):
43
+ if os.path.exists(self.dir + self.name + f"_latest.pth") and romatch.RANK == 0:
44
+ states = torch.load(self.dir + self.name + f"_latest.pth")
45
+ if "model" in states:
46
+ model.load_state_dict(states["model"])
47
+ if "n" in states:
48
+ n = states["n"] if states["n"] else n
49
+ if "optimizer" in states:
50
+ try:
51
+ optimizer.load_state_dict(states["optimizer"])
52
+ except Exception as e:
53
+ print(f"Failed to load states for optimizer, with error {e}")
54
+ if "lr_scheduler" in states:
55
+ lr_scheduler.load_state_dict(states["lr_scheduler"])
56
+ print(f"Loaded states {list(states.keys())}, at step {n}")
57
+ del states
58
+ gc.collect()
59
+ torch.cuda.empty_cache()
60
+ return model, optimizer, lr_scheduler, n
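
A minimal sketch of the save/load round trip provided by CheckPoint, assuming the romatch package is importable and using a toy model in place of the matcher (RANK defaults to 0 outside a distributed launch, so save() actually writes):

    import torch
    from romatch.checkpointing import CheckPoint

    model = torch.nn.Linear(4, 4)  # toy stand-in for the matcher
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100])

    checkpointer = CheckPoint("workspace/checkpoints/", name="toy_experiment")
    checkpointer.save(model, optimizer, lr_scheduler, n=0)

    # Later, or after a restart: restores the weights, optimizer, scheduler and step counter.
    model, optimizer, lr_scheduler, n = checkpointer.load(model, optimizer, lr_scheduler, n=0)
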