Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes.
- .gitattributes +24 -0
- LHM/__init__.py +15 -0
- LHM/__pycache__/__init__.cpython-310.pyc +0 -0
- LHM/__pycache__/launch.cpython-310.pyc +0 -0
- LHM/datasets/__init__.py +16 -0
- LHM/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
- LHM/datasets/__pycache__/cam_utils.cpython-310.pyc +0 -0
- LHM/datasets/__pycache__/mixer.cpython-310.pyc +0 -0
- LHM/datasets/base.py +70 -0
- LHM/datasets/cam_utils.py +205 -0
- LHM/datasets/mixer.py +75 -0
- LHM/launch.py +37 -0
- LHM/losses/__init__.py +20 -0
- LHM/losses/ball_loss.py +54 -0
- LHM/losses/offset_loss.py +52 -0
- LHM/losses/perceptual.py +70 -0
- LHM/losses/pixelwise.py +58 -0
- LHM/losses/tvloss.py +55 -0
- LHM/models/ESRGANer_utils.py +482 -0
- LHM/models/__init__.py +24 -0
- LHM/models/__pycache__/ESRGANer_utils.cpython-310.pyc +0 -0
- LHM/models/__pycache__/__init__.cpython-310.pyc +0 -0
- LHM/models/__pycache__/arcface_utils.cpython-310.pyc +0 -0
- LHM/models/__pycache__/embedder.cpython-310.pyc +0 -0
- LHM/models/__pycache__/modeling_human_lrm.cpython-310.pyc +0 -0
- LHM/models/__pycache__/transformer.cpython-310.pyc +0 -0
- LHM/models/__pycache__/transformer_dit.cpython-310.pyc +0 -0
- LHM/models/__pycache__/utils.cpython-310.pyc +0 -0
- LHM/models/arcface_utils.py +360 -0
- LHM/models/block.py +124 -0
- LHM/models/discriminator.py +120 -0
- LHM/models/embedder.py +37 -0
- LHM/models/encoders/__init__.py +15 -0
- LHM/models/encoders/__pycache__/__init__.cpython-310.pyc +0 -0
- LHM/models/encoders/__pycache__/dinov2_fusion_wrapper.cpython-310.pyc +0 -0
- LHM/models/encoders/__pycache__/sapiens_warpper.cpython-310.pyc +0 -0
- LHM/models/encoders/dino_wrapper.py +68 -0
- LHM/models/encoders/dinov2/__init__.py +15 -0
- LHM/models/encoders/dinov2/__pycache__/__init__.cpython-310.pyc +0 -0
- LHM/models/encoders/dinov2/hub/__init__.py +4 -0
- LHM/models/encoders/dinov2/hub/__pycache__/__init__.cpython-310.pyc +0 -0
- LHM/models/encoders/dinov2/hub/__pycache__/backbones.cpython-310.pyc +0 -0
- LHM/models/encoders/dinov2/hub/__pycache__/utils.cpython-310.pyc +0 -0
- LHM/models/encoders/dinov2/hub/backbones.py +166 -0
- LHM/models/encoders/dinov2/hub/classifiers.py +268 -0
- LHM/models/encoders/dinov2/hub/depth/__init__.py +7 -0
- LHM/models/encoders/dinov2/hub/depth/decode_heads.py +747 -0
- LHM/models/encoders/dinov2/hub/depth/encoder_decoder.py +351 -0
- LHM/models/encoders/dinov2/hub/depth/ops.py +28 -0
- LHM/models/encoders/dinov2/hub/depthers.py +246 -0
.gitattributes
CHANGED
@@ -33,3 +33,27 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/characters_images/000001.jpg filter=lfs diff=lfs merge=lfs -text
+assets/videos/scene_000000/bkgd_video.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/videos/scene_000000/smplx_video.mp4 filter=lfs diff=lfs merge=lfs -text
+diffsynth/models/__pycache__/sd3_text_encoder.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
+diffsynth/models/__pycache__/sd3_text_encoder.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+diffsynth/models/__pycache__/sd_unet.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+diffsynth/models/__pycache__/sdxl_unet.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
+diffsynth/models/__pycache__/sdxl_unet.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+diffsynth/models/__pycache__/sdxl_unet.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
+diffsynth/models/__pycache__/svd_unet.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
+diffsynth/models/__pycache__/svd_unet.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+diffsynth/models/__pycache__/svd_unet.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
+diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+diffsynth/tokenizer_configs/kolors/tokenizer/vocab.txt filter=lfs diff=lfs merge=lfs -text
+engine/pose_estimation/third-party/ViTPose/figures/Throughput.png filter=lfs diff=lfs merge=lfs -text
+engine/pose_estimation/third-party/ViTPose/mmpose/.mim/demo/resources/demo.mp4 filter=lfs diff=lfs merge=lfs -text
+engine/pose_estimation/third-party/ViTPose/mmpose/.mim/demo/resources/demo_coco.gif filter=lfs diff=lfs merge=lfs -text
+pretrained_models/dense_sample_points/1_40000.ply filter=lfs diff=lfs merge=lfs -text
+pretrained_models/dense_sample_points/1_60000.ply filter=lfs diff=lfs merge=lfs -text
+pretrained_models/dense_sample_points/1_80000.ply filter=lfs diff=lfs merge=lfs -text
+pretrained_models/gagatracker/vgghead/vgg_heads_l.trcd filter=lfs diff=lfs merge=lfs -text
+pretrained_models/huggingface/models--3DAIGC--LHM-1B-HF/blobs/59dc25167d1d72d57fb068445b96e2343ab550b649e9999765200502d03171b9 filter=lfs diff=lfs merge=lfs -text
+pretrained_models/human_model_files/smplx/smplx_uv/smplx_uv.png filter=lfs diff=lfs merge=lfs -text
+pretrained_models/sapiens/pretrained/checkpoints/sapiens_1b/sapiens_1b_epoch_173_torchscript.pt2 filter=lfs diff=lfs merge=lfs -text
LHM/__init__.py
ADDED
@@ -0,0 +1,15 @@
# Copyright (c) 2023-2024, Zexin He
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Empty
LHM/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (176 Bytes).
LHM/__pycache__/launch.cpython-310.pyc
ADDED
Binary file (723 Bytes).
LHM/datasets/__init__.py
ADDED
@@ -0,0 +1,16 @@
# Copyright (c) 2023-2024, Zexin He
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from .mixer import MixerDataset
LHM/datasets/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (227 Bytes).
LHM/datasets/__pycache__/cam_utils.cpython-310.pyc
ADDED
Binary file (5.46 kB).
LHM/datasets/__pycache__/mixer.cpython-310.pyc
ADDED
Binary file (2.06 kB).
LHM/datasets/base.py
ADDED
@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
# @Organization : Alibaba XR-Lab
# @Author : Peihao Li & Lingteng Qiu & Xiaodong Gu & Qi Zuo
# @Email : [email protected]
# @Time : 2025-03-10 18:47:56
# @Function : dataset base

import json
import pdb
import traceback
from abc import ABC, abstractmethod

import numpy as np
import torch
from megfile import smart_exists, smart_open, smart_path_join
from PIL import Image


class BaseDataset(torch.utils.data.Dataset, ABC):
    def __init__(self, root_dirs: str, meta_path: str):
        super().__init__()
        self.root_dirs = root_dirs
        self.uids = self._load_uids(meta_path)

    def __len__(self):
        return len(self.uids)

    @abstractmethod
    def inner_get_item(self, idx):
        pass

    def __getitem__(self, idx):
        try:
            return self.inner_get_item(idx)
        except Exception as e:
            traceback.print_exc()
            print(f"[DEBUG-DATASET] Error when loading {self.uids[idx]}")
            # raise e
            return self.__getitem__((idx + 1) % self.__len__())

    @staticmethod
    def _load_uids(meta_path: str):
        # meta_path is a json file
        if meta_path == None:
            uids = []
        else:
            with open(meta_path, "r") as f:
                uids = json.load(f)

        return uids

    @staticmethod
    def _load_rgba_image(file_path, bg_color: float = 1.0):
        """Load and blend RGBA image to RGB with certain background, 0-1 scaled"""
        rgba = np.array(Image.open(smart_open(file_path, "rb")))
        rgba = torch.from_numpy(rgba).float() / 255.0
        rgba = rgba.permute(2, 0, 1).unsqueeze(0)
        rgb = rgba[:, :3, :, :] * rgba[:, 3:4, :, :] + bg_color * (
            1 - rgba[:, 3:, :, :]
        )
        # rgba[:, :3, ...] * rgba[:, 3:, ...] + (1 - rgba[:, 3:, ...])
        return rgb

    @staticmethod
    def _locate_datadir(root_dirs, uid, locator: str):
        for root_dir in root_dirs:
            datadir = smart_path_join(root_dir, uid, locator)
            if smart_exists(datadir):
                return root_dir
        raise FileNotFoundError(f"Cannot find valid data directory for uid {uid}")
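A minimal sketch of how a concrete dataset could sit on top of BaseDataset above. The class name, the rgb/000.png locator and the returned dict are made-up illustrations (the real subclasses live elsewhere in the repo); only inner_get_item, _load_rgba_image and _locate_datadir come from the file itself.

# Hypothetical subclass for illustration; file layout and keys are assumptions.
from megfile import smart_path_join

from LHM.datasets.base import BaseDataset


class ToyRGBDataset(BaseDataset):
    """Yields one white-background-blended RGB tensor per uid listed in meta.json."""

    def inner_get_item(self, idx):
        uid = self.uids[idx]
        # pick the first root_dir that actually contains uid/<locator>
        root_dir = self._locate_datadir(self.root_dirs, uid, locator="rgb/000.png")
        image_path = smart_path_join(root_dir, uid, "rgb", "000.png")
        rgb = self._load_rgba_image(image_path, bg_color=1.0)  # (1, 3, H, W), 0-1 scaled
        return {"uid": uid, "rgb": rgb}


# dataset = ToyRGBDataset(root_dirs=["/data/human"], meta_path="/data/human/meta.json")
# a failed load is caught in __getitem__ and falls through to the next index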
LHM/datasets/cam_utils.py
ADDED
@@ -0,0 +1,205 @@
# Copyright (c) 2023-2024, Zexin He
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import math
import torch

"""
R: (N, 3, 3)
T: (N, 3)
E: (N, 4, 4)
vector: (N, 3)
"""


def compose_extrinsic_R_T(R: torch.Tensor, T: torch.Tensor):
    """
    Compose the standard form extrinsic matrix from R and T.
    Batched I/O.
    """
    RT = torch.cat((R, T.unsqueeze(-1)), dim=-1)
    return compose_extrinsic_RT(RT)


def compose_extrinsic_RT(RT: torch.Tensor):
    """
    Compose the standard form extrinsic matrix from RT.
    Batched I/O.
    """
    return torch.cat([
        RT,
        torch.tensor([[[0, 0, 0, 1]]], dtype=RT.dtype, device=RT.device).repeat(RT.shape[0], 1, 1)
    ], dim=1)


def decompose_extrinsic_R_T(E: torch.Tensor):
    """
    Decompose the standard extrinsic matrix into R and T.
    Batched I/O.
    """
    RT = decompose_extrinsic_RT(E)
    return RT[:, :, :3], RT[:, :, 3]


def decompose_extrinsic_RT(E: torch.Tensor):
    """
    Decompose the standard extrinsic matrix into RT.
    Batched I/O.
    """
    return E[:, :3, :]


def camera_normalization_objaverse(normed_dist_to_center, poses: torch.Tensor, ret_transform: bool = False):
    assert normed_dist_to_center is not None
    pivotal_pose = compose_extrinsic_RT(poses[:1])
    dist_to_center = pivotal_pose[:, :3, 3].norm(dim=-1, keepdim=True).item() \
        if normed_dist_to_center == 'auto' else normed_dist_to_center

    # compute camera norm (new version)
    canonical_camera_extrinsics = torch.tensor([[
        [1, 0, 0, 0],
        [0, 0, -1, -dist_to_center],
        [0, 1, 0, 0],
        [0, 0, 0, 1],
    ]], dtype=torch.float32)
    pivotal_pose_inv = torch.inverse(pivotal_pose)
    camera_norm_matrix = torch.bmm(canonical_camera_extrinsics, pivotal_pose_inv)

    # normalize all views
    poses = compose_extrinsic_RT(poses)
    poses = torch.bmm(camera_norm_matrix.repeat(poses.shape[0], 1, 1), poses)
    poses = decompose_extrinsic_RT(poses)

    if ret_transform:
        return poses, camera_norm_matrix.squeeze(dim=0)
    return poses


def get_normalized_camera_intrinsics(intrinsics: torch.Tensor):
    """
    intrinsics: (N, 3, 2), [[fx, fy], [cx, cy], [width, height]]
    Return batched fx, fy, cx, cy
    """
    fx, fy = intrinsics[:, 0, 0], intrinsics[:, 0, 1]
    cx, cy = intrinsics[:, 1, 0], intrinsics[:, 1, 1]
    width, height = intrinsics[:, 2, 0], intrinsics[:, 2, 1]
    fx, fy = fx / width, fy / height
    cx, cy = cx / width, cy / height
    return fx, fy, cx, cy


def build_camera_principle(RT: torch.Tensor, intrinsics: torch.Tensor):
    """
    RT: (N, 3, 4)
    intrinsics: (N, 3, 2), [[fx, fy], [cx, cy], [width, height]]
    """
    fx, fy, cx, cy = get_normalized_camera_intrinsics(intrinsics)
    return torch.cat([
        RT.reshape(-1, 12),
        fx.unsqueeze(-1), fy.unsqueeze(-1), cx.unsqueeze(-1), cy.unsqueeze(-1),
    ], dim=-1)


def build_camera_standard(RT: torch.Tensor, intrinsics: torch.Tensor):
    """
    RT: (N, 3, 4)
    intrinsics: (N, 3, 2), [[fx, fy], [cx, cy], [width, height]]
    """
    E = compose_extrinsic_RT(RT)
    fx, fy, cx, cy = get_normalized_camera_intrinsics(intrinsics)
    I = torch.stack([
        torch.stack([fx, torch.zeros_like(fx), cx], dim=-1),
        torch.stack([torch.zeros_like(fy), fy, cy], dim=-1),
        torch.tensor([[0, 0, 1]], dtype=torch.float32, device=RT.device).repeat(RT.shape[0], 1),
    ], dim=1)
    return torch.cat([
        E.reshape(-1, 16),
        I.reshape(-1, 9),
    ], dim=-1)


def center_looking_at_camera_pose(
    camera_position: torch.Tensor, look_at: torch.Tensor = None, up_world: torch.Tensor = None,
    device: torch.device = torch.device('cpu'),
):
    """
    camera_position: (M, 3)
    look_at: (3)
    up_world: (3)
    return: (M, 3, 4)
    """
    # by default, looking at the origin and world up is pos-z
    if look_at is None:
        look_at = torch.tensor([0, 0, 0], dtype=torch.float32, device=device)
    if up_world is None:
        up_world = torch.tensor([0, 0, 1], dtype=torch.float32, device=device)
    look_at = look_at.unsqueeze(0).repeat(camera_position.shape[0], 1)
    up_world = up_world.unsqueeze(0).repeat(camera_position.shape[0], 1)

    z_axis = camera_position - look_at
    z_axis = z_axis / z_axis.norm(dim=-1, keepdim=True)
    x_axis = torch.cross(up_world, z_axis)
    x_axis = x_axis / x_axis.norm(dim=-1, keepdim=True)
    y_axis = torch.cross(z_axis, x_axis)
    y_axis = y_axis / y_axis.norm(dim=-1, keepdim=True)
    extrinsics = torch.stack([x_axis, y_axis, z_axis, camera_position], dim=-1)
    return extrinsics


def surrounding_views_linspace(n_views: int, radius: float = 2.0, height: float = 0.8, device: torch.device = torch.device('cpu')):
    """
    n_views: number of surrounding views
    radius: camera dist to center
    height: height of the camera
    return: (M, 3, 4)
    """
    assert n_views > 0
    assert radius > 0

    theta = torch.linspace(-torch.pi / 2, 3 * torch.pi / 2, n_views, device=device)
    projected_radius = math.sqrt(radius ** 2 - height ** 2)
    x = torch.cos(theta) * projected_radius
    y = torch.sin(theta) * projected_radius
    z = torch.full((n_views,), height, device=device)

    camera_positions = torch.stack([x, y, z], dim=1)
    extrinsics = center_looking_at_camera_pose(camera_positions, device=device)

    return extrinsics


def create_intrinsics(
    f: float,
    c: float = None, cx: float = None, cy: float = None,
    w: float = 1., h: float = 1.,
    dtype: torch.dtype = torch.float32,
    device: torch.device = torch.device('cpu'),
):
    """
    return: (3, 2)
    """
    fx = fy = f
    if c is not None:
        assert cx is None and cy is None, "c and cx/cy cannot be used together"
        cx = cy = c
    else:
        assert cx is not None and cy is not None, "cx/cy must be provided when c is not provided"
    fx, fy, cx, cy, w, h = fx/w, fy/h, cx/w, cy/h, 1., 1.
    intrinsics = torch.tensor([
        [fx, fy],
        [cx, cy],
        [w, h],
    ], dtype=dtype, device=device)
    return intrinsics
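A short usage sketch of the conventions above, with arbitrary numbers: extrinsics are (N, 3, 4) RT matrices, intrinsics are stored as (3, 2) tensors [[fx, fy], [cx, cy], [width, height]], and build_camera_principle flattens both into a 16-dim per-view embedding.

from LHM.datasets.cam_utils import (
    build_camera_principle,
    create_intrinsics,
    surrounding_views_linspace,
)

# 8 cameras on a ring (radius 2.0, height 0.8), all looking at the origin
poses = surrounding_views_linspace(n_views=8)                       # (8, 3, 4)

# a single shared pinhole camera, normalized by its 512x512 image size
intrinsics = create_intrinsics(f=384.0, c=256.0, w=512.0, h=512.0)  # (3, 2)
intrinsics = intrinsics.unsqueeze(0).repeat(8, 1, 1)                # (8, 3, 2)

embedding = build_camera_principle(poses, intrinsics)               # (8, 16) = 12 RT values + fx, fy, cx, cy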
LHM/datasets/mixer.py
ADDED
@@ -0,0 +1,75 @@
# Copyright (c) 2023-2024, Zexin He
#
# Licensed under the Apache License, Version 2.0 (the "License");:
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import math
import pdb
from functools import partial

import torch

__all__ = ["MixerDataset"]


class MixerDataset(torch.utils.data.Dataset):
    """Reference"""

    def __init__(
        self,
        split: str,
        subsets: dict,
        **dataset_kwargs,
    ):

        self.subsets = [
            self._dataset_fn(subset, split)(
                use_flame=subset["use_flame"],
                src_head_size=subset.get("src_head_size", 448),
                **dataset_kwargs,
            )
            for subset in subsets
        ]
        self.virtual_lens = [
            math.ceil(subset_config["sample_rate"] * len(subset_obj))
            for subset_config, subset_obj in zip(subsets, self.subsets)
        ]

    @staticmethod
    def _dataset_fn(subset_config: dict, split: str):
        name = subset_config["name"]

        dataset_cls = None
        if name == "video_human":
            from .video_human import VideoHumanDataset

        else:
            raise NotImplementedError(f"Dataset {name} not implemented")

        return partial(
            dataset_cls,
            root_dirs=subset_config["root_dirs"],
            meta_path=subset_config["meta_path"][split],
        )

    def __len__(self):
        return sum(self.virtual_lens)

    def __getitem__(self, idx):
        subset_idx = 0
        virtual_idx = idx
        while virtual_idx >= self.virtual_lens[subset_idx]:
            virtual_idx -= self.virtual_lens[subset_idx]
            subset_idx += 1
        real_idx = virtual_idx % len(self.subsets[subset_idx])
        return self.subsets[subset_idx][real_idx]
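MixerDataset reads a handful of keys from each subsets entry: name (resolved by _dataset_fn), root_dirs, a per-split meta_path, a sample_rate that scales the subset's virtual length, plus use_flame and an optional src_head_size. A sketch of such a config with invented values; nothing below ships with this upload.

# Hypothetical subsets config; keys mirror what MixerDataset reads, values are invented.
subsets = [
    {
        "name": "video_human",
        "root_dirs": ["/data/video_human"],
        "meta_path": {
            "train": "/data/video_human/train.json",
            "val": "/data/video_human/val.json",
        },
        "sample_rate": 1.0,   # virtual length = ceil(sample_rate * len(dataset))
        "use_flame": False,
        "src_head_size": 448,
    },
]

# dataset = MixerDataset(split="train", subsets=subsets)
# len(dataset) == sum(dataset.virtual_lens)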
LHM/launch.py
ADDED
@@ -0,0 +1,37 @@
# Copyright (c) 2023-2024, Zexin He
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse
import pdb

from LHM.runners import REGISTRY_RUNNERS


def main():

    parser = argparse.ArgumentParser(description="OpenLRM launcher")
    parser.add_argument("runner", type=str, help="Runner to launch")
    args, unknown = parser.parse_known_args()

    if args.runner not in REGISTRY_RUNNERS:
        raise ValueError("Runner {} not found".format(args.runner))

    RunnerClass = REGISTRY_RUNNERS[args.runner]
    with RunnerClass() as runner:
        runner.run()


if __name__ == "__main__":
    main()
LHM/losses/__init__.py
ADDED
@@ -0,0 +1,20 @@
# Copyright (c) 2023-2024, Zexin He
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from .ball_loss import *
from .offset_loss import *
from .perceptual import *
from .pixelwise import *
from .tvloss import *
LHM/losses/ball_loss.py
ADDED
@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-
# @Organization : Alibaba XR-Lab
# @Author : Lingteng Qiu
# @Email : [email protected]
# @Time : 2025-03-10 19:08:35
# @Function : ASAP loss
import pdb

import torch
import torch.nn as nn

__all__ = ["ASAP_Loss", "Heuristic_ASAP_Loss"]


class ASAP_Loss(nn.Module):

    def forward(self, scaling, r=1, **params):
        """where r is the radius of the ball between max-axis and min-axis."""
        raise NotImplementedError(
            "ASAP_Loss is not implemented yet in Inference version"
        )


class Heuristic_ASAP_Loss(nn.Module):
    def __init__(self, group_dict, group_body_mapping):
        super(Heuristic_ASAP_Loss, self).__init__()

        self.group_dict = group_dict  # register weights fro different body parts
        self.group_body_mapping = group_body_mapping  # mapping of body parts to group

    def _heurisitic_loss(self, _ball_loss):

        _loss = 0.0
        for key in self.group_dict.keys():
            key_weights = self.group_dict[key]
            group_mapping_idx = self.group_body_mapping[key]
            _loss += key_weights * _ball_loss[:, group_mapping_idx].mean()

        return _loss

    def forward(self, scaling, r=5, **params):
        """where r is the radius of the ball between max-axis and min-axis."""
        "human motion or rotation is very different in each body parts, for example, the head is more stable than the leg and hand, so we use heuristic_ball_loss"

        _scale = scaling

        _scale_min = torch.min(_scale, dim=-1)[0]
        _scale_max = torch.max(_scale, dim=-1)[0]

        scale_ratio = _scale_max / (_scale_min + 1e-6)

        _ball_loss = torch.clamp(scale_ratio, min=r) - r

        return self._heurisitic_loss(_ball_loss)
LHM/losses/offset_loss.py
ADDED
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
# @Organization : Alibaba XR-Lab
# @Author : Lingteng Qiu
# @Email : [email protected]
# @Time : 2025-03-10 19:08:56
# @Function : ACAP Loss
import pdb

import torch
import torch.nn as nn
import torch.nn.functional as F

__all__ = ["ACAP_Loss", "Heuristic_ACAP_Loss"]


class ACAP_Loss(nn.Module):
    """As close as possibel loss"""

    def forward(self, offset, d=0.05625, **params):
        """Empirically, where d is the thresold of distance points leave from 1.8/32 = 0.0562."""

        offset_loss = torch.clamp(offset.norm(p=2, dim=-1), min=d) - d

        return offset_loss.mean()


class Heuristic_ACAP_Loss(nn.Module):
    """As close as possibel loss"""

    def __init__(self, group_dict, group_body_mapping):
        super(Heuristic_ACAP_Loss, self).__init__()

        self.group_dict = group_dict  # register weights fro different body parts
        self.group_body_mapping = group_body_mapping  # mapping of body parts to group

    def _heurisitic_loss(self, _offset_loss):

        _loss = 0.0
        for key in self.group_dict.keys():
            key_weights = self.group_dict[key]
            group_mapping_idx = self.group_body_mapping[key]
            _loss += key_weights * _offset_loss[:, group_mapping_idx].mean()

        return _loss

    def forward(self, offset, d=0.05625, **params):
        """Empirically, where d is the thresold of distance points leave from human prior model, 1.8/32 = 0.0562."""
        "human motion or rotation is very different in each body parts, for example, the head is more stable than the leg and hand, so we use heuristic_ball_loss"

        _offset_loss = torch.clamp(offset.norm(p=2, dim=-1), min=d) - d

        return self._heurisitic_loss(_offset_loss)
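Both heuristic losses above expect the same two dictionaries: a scalar weight per body-part group and a mapping from group name to the Gaussian-point indices it covers. A minimal sketch with invented groups, indices and tensor sizes (the real grouping comes from the model configuration, not from these files):

import torch

from LHM.losses import Heuristic_ACAP_Loss, Heuristic_ASAP_Loss

# Hypothetical grouping: a weight per group, and the point indices belonging to each group.
group_dict = {"head": 2.0, "body": 1.0}
group_body_mapping = {"head": [0, 1, 2], "body": [3, 4, 5, 6, 7]}

asap = Heuristic_ASAP_Loss(group_dict, group_body_mapping)
acap = Heuristic_ACAP_Loss(group_dict, group_body_mapping)

scaling = torch.rand(4, 8, 3)        # (batch, points, 3): per-axis Gaussian scales
offset = torch.rand(4, 8, 3) * 0.1   # (batch, points, 3): offsets from the human prior model

loss = asap(scaling, r=5) + acap(offset, d=0.05625)   # both reduce to scalars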
LHM/losses/perceptual.py
ADDED
@@ -0,0 +1,70 @@
# Copyright (c) 2023-2024, Zexin He
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import torch
import torch.nn as nn

__all__ = ['LPIPSLoss']


class LPIPSLoss(nn.Module):
    """
    Compute LPIPS loss between two images.
    """

    def __init__(self, device, prefech: bool = False):
        super().__init__()
        self.device = device
        self.cached_models = {}
        if prefech:
            self.prefetch_models()

    def _get_model(self, model_name: str):
        if model_name not in self.cached_models:
            import warnings
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', category=UserWarning)
                import lpips
                _model = lpips.LPIPS(net=model_name, eval_mode=True, verbose=False).to(self.device)
            _model = torch.compile(_model)
            self.cached_models[model_name] = _model
        return self.cached_models[model_name]

    def prefetch_models(self):
        _model_names = ['alex', 'vgg']
        for model_name in _model_names:
            self._get_model(model_name)

    def forward(self, x, y, is_training: bool = True):
        """
        Assume images are 0-1 scaled and channel first.

        Args:
            x: [N, M, C, H, W]
            y: [N, M, C, H, W]
            is_training: whether to use VGG or AlexNet.

        Returns:
            Mean-reduced LPIPS loss across batch.
        """
        model_name = 'vgg' if is_training else 'alex'
        loss_fn = self._get_model(model_name)
        N, M, C, H, W = x.shape
        x = x.reshape(N*M, C, H, W)
        y = y.reshape(N*M, C, H, W)
        image_loss = loss_fn(x, y, normalize=True).mean(dim=[1, 2, 3])
        batch_loss = image_loss.reshape(N, M).mean(dim=1)
        all_loss = batch_loss.mean()
        return all_loss
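A short sketch of the input layout LPIPSLoss expects (PixelLoss and TVLoss below reuse the same [N, M, C, H, W] convention): N batch items with M views each, 0-1 scaled. It assumes the lpips package is installed; the device choice is arbitrary.

import torch

from LHM.losses.perceptual import LPIPSLoss

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loss_fn = LPIPSLoss(device, prefech=True)   # pre-builds both the VGG and AlexNet backbones

pred = torch.rand(2, 4, 3, 256, 256, device=device)     # N=2 items, M=4 views each
target = torch.rand(2, 4, 3, 256, 256, device=device)

train_loss = loss_fn(pred, target, is_training=True)    # VGG backbone
eval_loss = loss_fn(pred, target, is_training=False)    # AlexNet backbone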
LHM/losses/pixelwise.py
ADDED
@@ -0,0 +1,58 @@
# Copyright (c) 2023-2024, Zexin He
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import torch
import torch.nn as nn

__all__ = ['PixelLoss']


class PixelLoss(nn.Module):
    """
    Pixel-wise loss between two images.
    """

    def __init__(self, option: str = 'mse'):
        super().__init__()
        self.loss_fn = self._build_from_option(option)

    @staticmethod
    def _build_from_option(option: str, reduction: str = 'none'):
        if option == 'mse':
            return nn.MSELoss(reduction=reduction)
        elif option == 'l1':
            return nn.L1Loss(reduction=reduction)
        else:
            raise NotImplementedError(f'Unknown pixel loss option: {option}')

    @torch.compile
    def forward(self, x, y):
        """
        Assume images are channel first.

        Args:
            x: [N, M, C, H, W]
            y: [N, M, C, H, W]

        Returns:
            Mean-reduced pixel loss across batch.
        """
        N, M, C, H, W = x.shape
        x = x.reshape(N*M, C, H, W)
        y = y.reshape(N*M, C, H, W)
        image_loss = self.loss_fn(x, y).mean(dim=[1, 2, 3])
        batch_loss = image_loss.reshape(N, M).mean(dim=1)
        all_loss = batch_loss.mean()
        return all_loss
LHM/losses/tvloss.py
ADDED
@@ -0,0 +1,55 @@
# Copyright (c) 2023-2024, Zexin He
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import torch
import torch.nn as nn

__all__ = ['TVLoss']


class TVLoss(nn.Module):
    """
    Total variance loss.
    """

    def __init__(self):
        super().__init__()

    def numel_excluding_first_dim(self, x):
        return x.numel() // x.shape[0]

    @torch.compile
    def forward(self, x):
        """
        Assume batched and channel first with inner sizes.

        Args:
            x: [N, M, C, H, W]

        Returns:
            Mean-reduced TV loss with element-level scaling.
        """
        N, M, C, H, W = x.shape
        x = x.reshape(N*M, C, H, W)
        diff_i = x[..., 1:, :] - x[..., :-1, :]
        diff_j = x[..., :, 1:] - x[..., :, :-1]
        div_i = self.numel_excluding_first_dim(diff_i)
        div_j = self.numel_excluding_first_dim(diff_j)
        tv_i = diff_i.pow(2).sum(dim=[1,2,3]) / div_i
        tv_j = diff_j.pow(2).sum(dim=[1,2,3]) / div_j
        tv = tv_i + tv_j
        batch_tv = tv.reshape(N, M).mean(dim=1)
        all_tv = batch_tv.mean()
        return all_tv
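Restated as a formula, for each of the N*M images x (C channels, H x W) the forward above computes a size-normalized total variation and then averages over the M views and over the batch:

\mathrm{TV}(x) = \frac{1}{C\,(H-1)\,W}\sum_{c,i,j}\bigl(x_{c,i+1,j}-x_{c,i,j}\bigr)^{2} + \frac{1}{C\,H\,(W-1)}\sum_{c,i,j}\bigl(x_{c,i,j+1}-x_{c,i,j}\bigr)^{2}

The two normalizers are exactly div_i and div_j (the per-image element counts of diff_i and diff_j), so images of different sizes contribute on a comparable scale.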
LHM/models/ESRGANer_utils.py
ADDED
@@ -0,0 +1,482 @@
# -*- coding: utf-8 -*-
# @Organization : Alibaba XR-Lab
# @Author : Lingteng Qiu
# @Email : [email protected]
# @Time : 2025-03-1 17:39:52
# @Function : Function to improve face quality when training.

import math
import os
import queue
import sys

sys.path.append("./")
import threading

import cv2
import numpy as np
import torch
from basicsr.utils.download_util import load_file_from_url
from torch.nn import functional as F

ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
import pdb

import torch
from basicsr.archs.rrdbnet_arch import RRDBNet


def avaliable_device():
    if torch.cuda.is_available():
        current_device_id = torch.cuda.current_device()
        device = f"cuda:{current_device_id}"
    else:
        device = "cpu"

    return device


class RealESRGANer:
    """A helper class for upsampling images with RealESRGAN.

    Args:
        scale (int): Upsampling scale factor used in the networks. It is usually 2 or 4.
        model_path (str): The path to the pretrained model. It can be urls (will first download it automatically).
        model (nn.Module): The defined network. Default: None.
        tile (int): As too large images result in the out of GPU memory issue, so this tile option will first crop
            input images into tiles, and then process each of them. Finally, they will be merged into one image.
            0 denotes for do not use tile. Default: 0.
        tile_pad (int): The pad size for each tile, to remove border artifacts. Default: 10.
        pre_pad (int): Pad the input images to avoid border artifacts. Default: 10.
        half (float): Whether to use half precision during inference. Default: False.
    """

    def __init__(
        self,
        scale,
        model_path,
        dni_weight=None,
        model=None,
        tile=0,
        tile_pad=10,
        pre_pad=10,
        half=False,
        device=None,
        gpu_id=None,
    ):
        self.scale = scale
        self.tile_size = tile
        self.tile_pad = tile_pad
        self.pre_pad = pre_pad
        self.mod_scale = None
        self.half = half

        # initialize model
        if gpu_id:
            self.device = (
                torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
                if device is None
                else device
            )
        else:
            self.device = (
                torch.device("cuda" if torch.cuda.is_available() else "cpu")
                if device is None
                else device
            )

        if isinstance(model_path, list):
            # dni
            assert len(model_path) == len(
                dni_weight
            ), "model_path and dni_weight should have the save length."
            loadnet = self.dni(model_path[0], model_path[1], dni_weight)
        else:
            # if the model_path starts with https, it will first download models to the folder: weights
            if model_path.startswith("https://"):
                model_path = load_file_from_url(
                    url=model_path,
                    model_dir=os.path.join(ROOT_DIR, "weights"),
                    progress=True,
                    file_name=None,
                )
            loadnet = torch.load(model_path, map_location=torch.device("cpu"))

        # prefer to use params_ema
        if "params_ema" in loadnet:
            keyname = "params_ema"
        else:
            keyname = "params"
        model.load_state_dict(loadnet[keyname], strict=True)

        model.eval()
        self.model = model.to(self.device)
        if self.half:
            self.model = self.model.half()

    def dni(self, net_a, net_b, dni_weight, key="params", loc="cpu"):
        """Deep network interpolation.

        ``Paper: Deep Network Interpolation for Continuous Imagery Effect Transition``
        """
        net_a = torch.load(net_a, map_location=torch.device(loc))
        net_b = torch.load(net_b, map_location=torch.device(loc))
        for k, v_a in net_a[key].items():
            net_a[key][k] = dni_weight[0] * v_a + dni_weight[1] * net_b[key][k]
        return net_a

    def pre_process(self, img):
        """Pre-process, such as pre-pad and mod pad, so that the images can be divisible"""
        img = torch.from_numpy(np.transpose(img, (2, 0, 1))).float()
        self.img = img.unsqueeze(0).to(self.device)
        if self.half:
            self.img = self.img.half()

        # pre_pad
        if self.pre_pad != 0:
            self.img = F.pad(self.img, (0, self.pre_pad, 0, self.pre_pad), "reflect")
        # mod pad for divisible borders
        if self.scale == 2:
            self.mod_scale = 2
        elif self.scale == 1:
            self.mod_scale = 4
        if self.mod_scale is not None:
            self.mod_pad_h, self.mod_pad_w = 0, 0
            _, _, h, w = self.img.size()
            if h % self.mod_scale != 0:
                self.mod_pad_h = self.mod_scale - h % self.mod_scale
            if w % self.mod_scale != 0:
                self.mod_pad_w = self.mod_scale - w % self.mod_scale
            self.img = F.pad(
                self.img, (0, self.mod_pad_w, 0, self.mod_pad_h), "reflect"
            )

    def process(self):
        # model inference
        self.output = self.model(self.img)

    def tile_process(self):
        """It will first crop input images to tiles, and then process each tile.
        Finally, all the processed tiles are merged into one images.

        Modified from: https://github.com/ata4/esrgan-launcher
        """
        batch, channel, height, width = self.img.shape
        output_height = height * self.scale
        output_width = width * self.scale
        output_shape = (batch, channel, output_height, output_width)

        # start with black image
        self.output = self.img.new_zeros(output_shape)
        tiles_x = math.ceil(width / self.tile_size)
        tiles_y = math.ceil(height / self.tile_size)

        # loop over all tiles
        for y in range(tiles_y):
            for x in range(tiles_x):
                # extract tile from input image
                ofs_x = x * self.tile_size
                ofs_y = y * self.tile_size
                # input tile area on total image
                input_start_x = ofs_x
                input_end_x = min(ofs_x + self.tile_size, width)
                input_start_y = ofs_y
                input_end_y = min(ofs_y + self.tile_size, height)

                # input tile area on total image with padding
                input_start_x_pad = max(input_start_x - self.tile_pad, 0)
                input_end_x_pad = min(input_end_x + self.tile_pad, width)
                input_start_y_pad = max(input_start_y - self.tile_pad, 0)
                input_end_y_pad = min(input_end_y + self.tile_pad, height)

                # input tile dimensions
                input_tile_width = input_end_x - input_start_x
                input_tile_height = input_end_y - input_start_y
                tile_idx = y * tiles_x + x + 1
                input_tile = self.img[
                    :,
                    :,
                    input_start_y_pad:input_end_y_pad,
                    input_start_x_pad:input_end_x_pad,
                ]

                # upscale tile
                try:
                    with torch.no_grad():
                        output_tile = self.model(input_tile)
                except RuntimeError as error:
                    print("Error", error)
                    print(f"\tTile {tile_idx}/{tiles_x * tiles_y}")

                # output tile area on total image
                output_start_x = input_start_x * self.scale
                output_end_x = input_end_x * self.scale
                output_start_y = input_start_y * self.scale
                output_end_y = input_end_y * self.scale

                # output tile area without padding
                output_start_x_tile = (input_start_x - input_start_x_pad) * self.scale
                output_end_x_tile = output_start_x_tile + input_tile_width * self.scale
                output_start_y_tile = (input_start_y - input_start_y_pad) * self.scale
                output_end_y_tile = output_start_y_tile + input_tile_height * self.scale

                # put tile into output image
                self.output[
                    :, :, output_start_y:output_end_y, output_start_x:output_end_x
                ] = output_tile[
                    :,
                    :,
                    output_start_y_tile:output_end_y_tile,
                    output_start_x_tile:output_end_x_tile,
                ]

    def post_process(self):
        # remove extra pad
        if self.mod_scale is not None:
            _, _, h, w = self.output.size()
            self.output = self.output[
                :,
                :,
                0 : h - self.mod_pad_h * self.scale,
                0 : w - self.mod_pad_w * self.scale,
            ]
        # remove prepad
        if self.pre_pad != 0:
            _, _, h, w = self.output.size()
            self.output = self.output[
                :,
                :,
                0 : h - self.pre_pad * self.scale,
                0 : w - self.pre_pad * self.scale,
            ]
        return self.output

    @torch.no_grad()
    def enhance(self, img, outscale=None, alpha_upsampler="realesrgan"):
        h_input, w_input = img.shape[0:2]
        # img: numpy
        img = img.astype(np.float32)
        if np.max(img) > 256:  # 16-bit image
            max_range = 65535
            print("\tInput is a 16-bit image")
        else:
            max_range = 255
        img = img / max_range
        if len(img.shape) == 2:  # gray image
            img_mode = "L"
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
        elif img.shape[2] == 4:  # RGBA image with alpha channel
            img_mode = "RGBA"
            alpha = img[:, :, 3]
            img = img[:, :, 0:3]
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            if alpha_upsampler == "realesrgan":
                alpha = cv2.cvtColor(alpha, cv2.COLOR_GRAY2RGB)
        else:
            img_mode = "RGB"
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # ------------------- process image (without the alpha channel) ------------------- #
        self.pre_process(img)
        if self.tile_size > 0:
            self.tile_process()
        else:
            self.process()
        output_img = self.post_process()
        output_img = output_img.data.squeeze().float().cpu().clamp_(0, 1).numpy()
        output_img = np.transpose(output_img[[2, 1, 0], :, :], (1, 2, 0))
        if img_mode == "L":
            output_img = cv2.cvtColor(output_img, cv2.COLOR_BGR2GRAY)

        # ------------------- process the alpha channel if necessary ------------------- #
        if img_mode == "RGBA":
            if alpha_upsampler == "realesrgan":
                self.pre_process(alpha)
                if self.tile_size > 0:
                    self.tile_process()
                else:
                    self.process()
                output_alpha = self.post_process()
                output_alpha = (
                    output_alpha.data.squeeze().float().cpu().clamp_(0, 1).numpy()
                )
                output_alpha = np.transpose(output_alpha[[2, 1, 0], :, :], (1, 2, 0))
                output_alpha = cv2.cvtColor(output_alpha, cv2.COLOR_BGR2GRAY)
            else:  # use the cv2 resize for alpha channel
                h, w = alpha.shape[0:2]
                output_alpha = cv2.resize(
                    alpha,
                    (w * self.scale, h * self.scale),
                    interpolation=cv2.INTER_LINEAR,
                )

            # merge the alpha channel
            output_img = cv2.cvtColor(output_img, cv2.COLOR_BGR2BGRA)
            output_img[:, :, 3] = output_alpha

        # ------------------------------ return ------------------------------ #
        if max_range == 65535:  # 16-bit image
            output = (output_img * 65535.0).round().astype(np.uint16)
        else:
            output = (output_img * 255.0).round().astype(np.uint8)

        if outscale is not None and outscale != float(self.scale):
            output = cv2.resize(
                output,
                (
                    int(w_input * outscale),
                    int(h_input * outscale),
                ),
                interpolation=cv2.INTER_LANCZOS4,
            )

        return output, img_mode


class PrefetchReader(threading.Thread):
    """Prefetch images.

    Args:
        img_list (list[str]): A image list of image paths to be read.
        num_prefetch_queue (int): Number of prefetch queue.
    """

    def __init__(self, img_list, num_prefetch_queue):
        super().__init__()
        self.que = queue.Queue(num_prefetch_queue)
        self.img_list = img_list

    def run(self):
        for img_path in self.img_list:
            img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
            self.que.put(img)

        self.que.put(None)

    def __next__(self):
        next_item = self.que.get()
        if next_item is None:
            raise StopIteration
        return next_item

    def __iter__(self):
        return self


class IOConsumer(threading.Thread):

    def __init__(self, opt, que, qid):
        super().__init__()
        self._queue = que
        self.qid = qid
        self.opt = opt

    def run(self):
        while True:
            msg = self._queue.get()
            if isinstance(msg, str) and msg == "quit":
                break

            output = msg["output"]
            save_path = msg["save_path"]
            cv2.imwrite(save_path, output)
        print(f"IO worker {self.qid} is done.")


class ESRGANEasyModel:
    def __init__(
        self, model_path="./pretrained_models/RealESRGAN_x4plus.pth", face_enhance=True
    ):
        model = RRDBNet(
            num_in_ch=3,
            num_out_ch=3,
            num_feat=64,
            num_block=23,
            num_grow_ch=32,
            scale=4,
        )
        self.net_scale = 4
        file_url = [
            "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth"
        ]
        if model_path is None:
            model_path = os.path.join("weights", args.model_name + ".pth")
            if not os.path.isfile(model_path):
                ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
                for url in file_url:
                    # model_path will be updated
                    model_path = load_file_from_url(
                        url=url,
                        model_dir=os.path.join("./", "pretrained_models"),
                        progress=True,
                        file_name=None,
                    )
        self.face_enhance = face_enhance

        dni_weight = None

        self.upsampler = RealESRGANer(
            scale=self.net_scale,
            model_path=model_path,
            dni_weight=dni_weight,
            model=model,
            tile=0,
            tile_pad=10,
            pre_pad=0,
            half=False,
        )

        self.upsampler.model.to(avaliable_device())
        if face_enhance:  # Use GFPGAN for face enhancement
            from gfpgan import GFPGANer

            self.face_enhancer = GFPGANer(
                model_path="https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth",
                upscale=4,
                arch="clean",
                channel_multiplier=2,
                bg_upsampler=self.upsampler,
            )
        else:
            self.face_enhancer = None

    @torch.no_grad()
    def __call__(self, img):
        if self.face_enhancer is not None:
            _, _, output = self.face_enhancer.enhance(
                img, has_aligned=False, only_center_face=False, paste_back=True
            )
        else:
            output, _ = self.upsampler.enhance(img, outscale=4)
        return output

    def __repr__(self):
        return f"ESRGANEasyModel:\n {self.upsampler}"


if __name__ == "__main__":

    import time

    model = ESRGANEasyModel(face_enhance=True)
    input_img = "./debug/face_debug/gt/head_gt_0.png"

    img_np = cv2.imread(input_img)
    set1 = [
        "./debug/face_debug/gt/head_gt_0.png",
        "./debug/face_debug/gt/head_gt_1.png",
        "./debug/face_debug/gt/head_gt_2.png",
        "./debug/face_debug/gt/head_gt_3.png",
        "./debug/face_debug/gt/head_gt_4.png",
        "./debug/face_debug/gt/head_gt_5.png",
        "./debug/face_debug/gt/head_gt_6.png",
        "./debug/face_debug/gt/head_gt_0.png",
    ]
    img_set1 = [cv2.imread(img_path) for img_path in set1]
+
sr = model(img_set1[0])
|
| 478 |
+
|
| 479 |
+
s0 = time.time()
|
| 480 |
+
for img in img_set1:
|
| 481 |
+
|
| 482 |
+
sr = model(img)
|
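A minimal usage sketch for the `ESRGANEasyModel` wrapper added above, mirroring its own `__main__` block; the input image path is hypothetical, and the checkpoint paths are the file's defaults:

```python
import cv2

from LHM.models.ESRGANer_utils import ESRGANEasyModel

# face_enhance=True routes crops through GFPGAN, with Real-ESRGAN as the background upsampler
model = ESRGANEasyModel(face_enhance=True)

img = cv2.imread("path/to/low_res_face.png")  # hypothetical BGR uint8 input
sr = model(img)                               # 4x upscaled BGR uint8 output
cv2.imwrite("face_sr.png", sr)
```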
LHM/models/__init__.py
ADDED
@@ -0,0 +1,24 @@
# Copyright (c) 2023-2024, Zexin He
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from .modeling_human_lrm import (
    ModelHumanLRM,
    ModelHumanLRMSapdinoBodyHeadSD3_5,
)

model_dict = {
    "human_lrm": ModelHumanLRM,
    "human_lrm_sapdino_bh_sd3_5": ModelHumanLRMSapdinoBodyHeadSD3_5,
}
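A sketch of how the `model_dict` registry exported here is typically consumed; the `build_model` helper and the constructor keyword arguments are assumptions for illustration, not part of this diff:

```python
from LHM.models import model_dict


def build_model(model_name: str, **model_kwargs):
    """Instantiate a registered LHM model by name (kwargs are model-specific, assumed interface)."""
    try:
        model_cls = model_dict[model_name]
    except KeyError:
        raise ValueError(f"Unknown model {model_name!r}; choices: {list(model_dict)}")
    return model_cls(**model_kwargs)
```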
LHM/models/__pycache__/ESRGANer_utils.cpython-310.pyc
ADDED
Binary file (11.9 kB)
LHM/models/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (352 Bytes)
LHM/models/__pycache__/arcface_utils.cpython-310.pyc
ADDED
Binary file (9.73 kB)
LHM/models/__pycache__/embedder.cpython-310.pyc
ADDED
Binary file (1.03 kB)
LHM/models/__pycache__/modeling_human_lrm.cpython-310.pyc
ADDED
Binary file (21.7 kB)
LHM/models/__pycache__/transformer.cpython-310.pyc
ADDED
Binary file (6.89 kB)
LHM/models/__pycache__/transformer_dit.cpython-310.pyc
ADDED
Binary file (15 kB)
LHM/models/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (1.44 kB)
LHM/models/arcface_utils.py
ADDED
@@ -0,0 +1,360 @@
# -*- coding: utf-8 -*-
# @Organization : Alibaba XR-Lab
# @Author       : Lingteng Qiu
# @Email        : [email protected]
# @Time         : 2025-03-10 17:38:29
# @Function     : Arc-Similarity Loss
import sys

sys.path.append(".")

import pdb
from copy import deepcopy

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parallel import DataParallel, DistributedDataParallel


def conv3x3(inplanes, outplanes, stride=1):
    """A simple wrapper for 3x3 convolution with padding.

    Args:
        inplanes (int): Channel number of inputs.
        outplanes (int): Channel number of outputs.
        stride (int): Stride in convolution. Default: 1.
    """
    return nn.Conv2d(
        inplanes, outplanes, kernel_size=3, stride=stride, padding=1, bias=False
    )


class BasicBlock(nn.Module):
    """Basic residual block used in the ResNetArcFace architecture.

    Args:
        inplanes (int): Channel number of inputs.
        planes (int): Channel number of outputs.
        stride (int): Stride in convolution. Default: 1.
        downsample (nn.Module): The downsample module. Default: None.
    """

    expansion = 1  # output channel expansion ratio

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class IRBlock(nn.Module):
    """Improved residual block (IR Block) used in the ResNetArcFace architecture.

    Args:
        inplanes (int): Channel number of inputs.
        planes (int): Channel number of outputs.
        stride (int): Stride in convolution. Default: 1.
        downsample (nn.Module): The downsample module. Default: None.
        use_se (bool): Whether use the SEBlock (squeeze and excitation block). Default: True.
    """

    expansion = 1  # output channel expansion ratio

    def __init__(self, inplanes, planes, stride=1, downsample=None, use_se=True):
        super(IRBlock, self).__init__()
        self.bn0 = nn.BatchNorm2d(inplanes)
        self.conv1 = conv3x3(inplanes, inplanes)
        self.bn1 = nn.BatchNorm2d(inplanes)
        self.prelu = nn.PReLU()
        self.conv2 = conv3x3(inplanes, planes, stride)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride
        self.use_se = use_se
        if self.use_se:
            self.se = SEBlock(planes)

    def forward(self, x):
        residual = x
        out = self.bn0(x)
        out = self.conv1(out)
        out = self.bn1(out)
        out = self.prelu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        if self.use_se:
            out = self.se(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.prelu(out)

        return out


class Bottleneck(nn.Module):
    """Bottleneck block used in the ResNetArcFace architecture.

    Args:
        inplanes (int): Channel number of inputs.
        planes (int): Channel number of outputs.
        stride (int): Stride in convolution. Default: 1.
        downsample (nn.Module): The downsample module. Default: None.
    """

    expansion = 4  # output channel expansion ratio

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(
            planes, planes * self.expansion, kernel_size=1, bias=False
        )
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class SEBlock(nn.Module):
    """The squeeze-and-excitation block (SEBlock) used in the IRBlock.

    Args:
        channel (int): Channel number of inputs.
        reduction (int): Channel reduction ration. Default: 16.
    """

    def __init__(self, channel, reduction=16):
        super(SEBlock, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(
            1
        )  # pool to 1x1 without spatial information
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.PReLU(),
            nn.Linear(channel // reduction, channel),
            nn.Sigmoid(),
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        return x * y


class ResNetArcFace(nn.Module):
    """ArcFace with ResNet architectures.

    Ref: ArcFace: Additive Angular Margin Loss for Deep Face Recognition.

    Args:
        block (str): Block used in the ArcFace architecture.
        layers (tuple(int)): Block numbers in each layer.
        use_se (bool): Whether use the SEBlock (squeeze and excitation block). Default: True.
    """

    def __init__(
        self,
        block="IRBlock",
        layers=[2, 2, 2, 2],
        use_se=False,
        pretrain_model="./pretrained_models/arcface_resnet18.pth",
    ):
        if block == "IRBlock":
            block = IRBlock
        self.inplanes = 64
        self.use_se = use_se
        super(ResNetArcFace, self).__init__()

        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.prelu = nn.PReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.bn4 = nn.BatchNorm2d(512)
        self.dropout = nn.Dropout()
        self.fc5 = nn.Linear(512 * 8 * 8, 512)
        self.bn5 = nn.BatchNorm1d(512)

        # initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.constant_(m.bias, 0)

        if pretrain_model is not None:
            self.load_network(self, pretrain_model, strict=True, param_key=None)
        else:
            raise ValueError("Please specify the pretrain model path.")

        self.freeze()

    @staticmethod
    def load_network(net, load_path, strict=True, param_key=None):

        def get_bare_model(net):
            if isinstance(net, (DataParallel, DistributedDataParallel)):
                net = net.module
            return net

        net = get_bare_model(net)
        load_net = torch.load(load_path, map_location=lambda storage, loc: storage)
        if param_key is not None:
            if param_key not in load_net and "params" in load_net:
                param_key = "params"
            load_net = load_net[param_key]
        # remove unnecessary 'module.'
        for k, v in deepcopy(load_net).items():
            if k.startswith("module."):
                load_net[k[7:]] = v
                load_net.pop(k)
        ret = net.load_state_dict(load_net, strict=strict)
        print(ret)

    def _make_layer(self, block, planes, num_blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.inplanes,
                    planes * block.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                ),
                nn.BatchNorm2d(planes * block.expansion),
            )
        layers = []
        layers.append(
            block(self.inplanes, planes, stride, downsample, use_se=self.use_se)
        )
        self.inplanes = planes
        for _ in range(1, num_blocks):
            layers.append(block(self.inplanes, planes, use_se=self.use_se))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.prelu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.bn4(x)
        x = self.dropout(x)
        x = x.view(x.size(0), -1)
        x = self.fc5(x)
        x = self.bn5(x)

        return x

    def freeze(self):
        self.eval()
        for param in self.parameters():
            param.requires_grad = False


if __name__ == "__main__":
    model = ResNetArcFace()
    model.cuda()
    model.eval()
    # model.eval()

    set1 = [
        "./debug/face_debug/gt/head_gt_0.png",
        "./debug/face_debug/gt/head_gt_1.png",
        "./debug/face_debug/gt/head_gt_2.png",
        "./debug/face_debug/gt/head_gt_3.png",
        "./debug/face_debug/gt/head_gt_4.png",
        "./debug/face_debug/gt/head_gt_5.png",
        "./debug/face_debug/gt/head_gt_6.png",
    ]
    import cv2

    img_set1 = [cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) for img_path in set1]

    F1_list = []

    f1_scores = []
    for img in img_set1:
        img = torch.from_numpy(img).unsqueeze(0).unsqueeze(0) / 255.0
        img = img.cuda()
        F1 = model(img)
        F1_list.append(F1)
    for i in range(len(F1_list)):
        for j in range(len(F1_list)):
            f1_scores.append(F.l1_loss(F1_list[i], F1_list[j]))

    print(len(f1_scores))

    f1_scores = torch.tensor(f1_scores)
    print(f1_scores)
    f1_scores = f1_scores.view(len(F1_list), len(F1_list))
    print(f1_scores)
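Following the `__main__` block above, a sketch of how this frozen ArcFace backbone can be used as an identity-similarity term between rendered and ground-truth face crops; the 128x128 grayscale crop size is an assumption implied by the `512 * 8 * 8` fully connected layer:

```python
import torch
import torch.nn.functional as F

from LHM.models.arcface_utils import ResNetArcFace

# Loads ./pretrained_models/arcface_resnet18.pth and freezes all parameters in __init__
arcface = ResNetArcFace().cuda()


def face_id_loss(pred_gray: torch.Tensor, gt_gray: torch.Tensor) -> torch.Tensor:
    # pred_gray / gt_gray: [B, 1, 128, 128] grayscale crops in [0, 1]
    feat_pred = arcface(pred_gray)
    feat_gt = arcface(gt_gray).detach()
    return F.l1_loss(feat_pred, feat_gt)
```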
LHM/models/block.py
ADDED
@@ -0,0 +1,124 @@
# Copyright (c) 2023-2024, Zexin He
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import torch.nn as nn

from .modulate import ModLN


class BasicBlock(nn.Module):
    """
    Transformer block that is in its simplest form.
    Designed for PF-LRM architecture.
    """
    # Block contains a self-attention layer and an MLP
    def __init__(self, inner_dim: int, num_heads: int, eps: float,
                 attn_drop: float = 0., attn_bias: bool = False,
                 mlp_ratio: float = 4., mlp_drop: float = 0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(inner_dim, eps=eps)
        self.self_attn = nn.MultiheadAttention(
            embed_dim=inner_dim, num_heads=num_heads,
            dropout=attn_drop, bias=attn_bias, batch_first=True)
        self.norm2 = nn.LayerNorm(inner_dim, eps=eps)
        self.mlp = nn.Sequential(
            nn.Linear(inner_dim, int(inner_dim * mlp_ratio)),
            nn.GELU(),
            nn.Dropout(mlp_drop),
            nn.Linear(int(inner_dim * mlp_ratio), inner_dim),
            nn.Dropout(mlp_drop),
        )

    def forward(self, x):
        # x: [N, L, D]
        before_sa = self.norm1(x)
        x = x + self.self_attn(before_sa, before_sa, before_sa, need_weights=False)[0]
        x = x + self.mlp(self.norm2(x))
        return x


class ConditionBlock(nn.Module):
    """
    Transformer block that takes in a cross-attention condition.
    Designed for SparseLRM architecture.
    """
    # Block contains a cross-attention layer, a self-attention layer, and an MLP
    def __init__(self, inner_dim: int, cond_dim: int, num_heads: int, eps: float,
                 attn_drop: float = 0., attn_bias: bool = False,
                 mlp_ratio: float = 4., mlp_drop: float = 0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(inner_dim, eps=eps)
        self.cross_attn = nn.MultiheadAttention(
            embed_dim=inner_dim, num_heads=num_heads, kdim=cond_dim, vdim=cond_dim,
            dropout=attn_drop, bias=attn_bias, batch_first=True)
        self.norm2 = nn.LayerNorm(inner_dim, eps=eps)
        self.self_attn = nn.MultiheadAttention(
            embed_dim=inner_dim, num_heads=num_heads,
            dropout=attn_drop, bias=attn_bias, batch_first=True)
        self.norm3 = nn.LayerNorm(inner_dim, eps=eps)
        self.mlp = nn.Sequential(
            nn.Linear(inner_dim, int(inner_dim * mlp_ratio)),
            nn.GELU(),
            nn.Dropout(mlp_drop),
            nn.Linear(int(inner_dim * mlp_ratio), inner_dim),
            nn.Dropout(mlp_drop),
        )

    def forward(self, x, cond):
        # x: [N, L, D]
        # cond: [N, L_cond, D_cond]
        x = x + self.cross_attn(self.norm1(x), cond, cond, need_weights=False)[0]
        before_sa = self.norm2(x)
        x = x + self.self_attn(before_sa, before_sa, before_sa, need_weights=False)[0]
        x = x + self.mlp(self.norm3(x))
        return x


class ConditionModulationBlock(nn.Module):
    """
    Transformer block that takes in a cross-attention condition and another modulation vector applied to sub-blocks.
    Designed for raw LRM architecture.
    """
    # Block contains a cross-attention layer, a self-attention layer, and an MLP
    def __init__(self, inner_dim: int, cond_dim: int, mod_dim: int, num_heads: int, eps: float,
                 attn_drop: float = 0., attn_bias: bool = False,
                 mlp_ratio: float = 4., mlp_drop: float = 0.):
        super().__init__()
        self.norm1 = ModLN(inner_dim, mod_dim, eps)
        self.cross_attn = nn.MultiheadAttention(
            embed_dim=inner_dim, num_heads=num_heads, kdim=cond_dim, vdim=cond_dim,
            dropout=attn_drop, bias=attn_bias, batch_first=True)
        self.norm2 = ModLN(inner_dim, mod_dim, eps)
        self.self_attn = nn.MultiheadAttention(
            embed_dim=inner_dim, num_heads=num_heads,
            dropout=attn_drop, bias=attn_bias, batch_first=True)
        self.norm3 = ModLN(inner_dim, mod_dim, eps)
        self.mlp = nn.Sequential(
            nn.Linear(inner_dim, int(inner_dim * mlp_ratio)),
            nn.GELU(),
            nn.Dropout(mlp_drop),
            nn.Linear(int(inner_dim * mlp_ratio), inner_dim),
            nn.Dropout(mlp_drop),
        )

    def forward(self, x, cond, mod):
        # x: [N, L, D]
        # cond: [N, L_cond, D_cond]
        # mod: [N, D_mod]
        x = x + self.cross_attn(self.norm1(x, mod), cond, cond, need_weights=False)[0]
        before_sa = self.norm2(x, mod)
        x = x + self.self_attn(before_sa, before_sa, before_sa, need_weights=False)[0]
        x = x + self.mlp(self.norm3(x, mod))
        return x
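A quick shape-check sketch for the `ConditionBlock` defined above; the dimensions are arbitrary example values, not LHM's configuration:

```python
import torch

from LHM.models.block import ConditionBlock

block = ConditionBlock(inner_dim=512, cond_dim=768, num_heads=8, eps=1e-6)

tokens = torch.randn(2, 1024, 512)  # [N, L, D] latent tokens
cond = torch.randn(2, 197, 768)     # [N, L_cond, D_cond] image-encoder tokens

out = block(tokens, cond)           # cross-attn -> self-attn -> MLP, shape preserved
assert out.shape == tokens.shape
```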
LHM/models/discriminator.py
ADDED
@@ -0,0 +1,120 @@
"""
Ported from Paella
"""

import torch
from torch import nn

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.models.modeling_utils import ModelMixin

import functools
# import torch.nn as nn
from taming.modules.util import ActNorm


# Discriminator model ported from Paella https://github.com/dome272/Paella/blob/main/src_distributed/vqgan.py
class Discriminator(ModelMixin, ConfigMixin):
    @register_to_config
    def __init__(self, in_channels=3, cond_channels=0, hidden_channels=512, depth=6):
        super().__init__()
        d = max(depth - 3, 3)
        layers = [
            nn.utils.spectral_norm(
                nn.Conv2d(in_channels, hidden_channels // (2**d), kernel_size=3, stride=2, padding=1)
            ),
            nn.LeakyReLU(0.2),
        ]
        for i in range(depth - 1):
            c_in = hidden_channels // (2 ** max((d - i), 0))
            c_out = hidden_channels // (2 ** max((d - 1 - i), 0))
            layers.append(nn.utils.spectral_norm(nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
            layers.append(nn.InstanceNorm2d(c_out))
            layers.append(nn.LeakyReLU(0.2))
        self.encoder = nn.Sequential(*layers)
        self.shuffle = nn.Conv2d(
            (hidden_channels + cond_channels) if cond_channels > 0 else hidden_channels, 1, kernel_size=1
        )
        # self.logits = nn.Sigmoid()


    def forward(self, x, cond=None):
        x = self.encoder(x)
        if cond is not None:
            cond = cond.view(
                cond.size(0),
                cond.size(1),
                1,
                1,
            ).expand(-1, -1, x.size(-2), x.size(-1))
            x = torch.cat([x, cond], dim=1)
        x = self.shuffle(x)
        # x = self.logits(x)
        return x




def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)


class NLayerDiscriminator(nn.Module):
    """Defines a PatchGAN discriminator as in Pix2Pix
        --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
    """
    def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
        """Construct a PatchGAN discriminator
        Parameters:
            input_nc (int)  -- the number of channels in input images
            ndf (int)       -- the number of filters in the last conv layer
            n_layers (int)  -- the number of conv layers in the discriminator
            norm_layer      -- normalization layer
        """
        super(NLayerDiscriminator, self).__init__()
        if not use_actnorm:
            # norm_layer = nn.BatchNorm2d
            norm_layer = nn.InstanceNorm2d
        else:
            norm_layer = ActNorm
        if type(norm_layer) == functools.partial:  # no need to use bias as BatchNorm2d has affine parameters
            # use_bias = norm_layer.func != nn.BatchNorm2d
            use_bias = norm_layer.func != nn.InstanceNorm2d
        else:
            # use_bias = norm_layer != nn.BatchNorm2d
            use_bias = norm_layer != nn.InstanceNorm2d

        kw = 4
        padw = 1
        sequence = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, False)]
        nf_mult = 1
        nf_mult_prev = 1
        for n in range(1, n_layers):  # gradually increase the number of filters
            nf_mult_prev = nf_mult
            nf_mult = min(2 ** n, 8)
            sequence += [
                nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias),
                norm_layer(ndf * nf_mult),
                nn.LeakyReLU(0.2, False)
            ]

        nf_mult_prev = nf_mult
        nf_mult = min(2 ** n_layers, 8)
        sequence += [
            nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias),
            norm_layer(ndf * nf_mult),
            nn.LeakyReLU(0.2, False)
        ]

        sequence += [
            nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)]  # output 1 channel prediction map
        self.main = nn.Sequential(*sequence)

    def forward(self, input):
        """Standard forward."""
        return self.main(input)
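A minimal sketch of driving the PatchGAN discriminator above with a hinge loss; the hinge formulation is a common choice assumed here for illustration, not something this file prescribes:

```python
import torch
import torch.nn.functional as F

from LHM.models.discriminator import NLayerDiscriminator, weights_init

disc = NLayerDiscriminator(input_nc=3, ndf=64, n_layers=3).apply(weights_init)

real = torch.rand(2, 3, 256, 256)  # stand-in ground-truth renders
fake = torch.rand(2, 3, 256, 256)  # stand-in generated renders

# Hinge GAN losses on the patch-wise logit maps
d_loss = F.relu(1.0 - disc(real)).mean() + F.relu(1.0 + disc(fake.detach())).mean()
g_loss = -disc(fake).mean()
```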
LHM/models/embedder.py
ADDED
@@ -0,0 +1,37 @@
# Copyright (c) 2023-2024, Zexin He
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import torch
import torch.nn as nn


class CameraEmbedder(nn.Module):
    """
    Embed camera features to a high-dimensional vector.

    Reference:
    DiT: https://github.com/facebookresearch/DiT/blob/main/models.py#L27
    """
    def __init__(self, raw_dim: int, embed_dim: int):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(raw_dim, embed_dim),
            nn.SiLU(),
            nn.Linear(embed_dim, embed_dim),
        )

    @torch.compile
    def forward(self, x):
        return self.mlp(x)
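A usage sketch for `CameraEmbedder`; the 16-dimensional raw camera vector (a flattened 3x4 camera matrix plus `fx, fy, cx, cy`) is an assumption for illustration rather than a value fixed by this file:

```python
import torch

from LHM.models.embedder import CameraEmbedder

embedder = CameraEmbedder(raw_dim=16, embed_dim=1024)

cam = torch.randn(4, 16)   # [N, raw_dim] flattened extrinsics + intrinsics (assumed layout)
mod = embedder(cam)        # [N, embed_dim] camera modulation vector
```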
LHM/models/encoders/__init__.py
ADDED
@@ -0,0 +1,15 @@
# Copyright (c) 2023-2024, Zexin He
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Empty
LHM/models/encoders/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (192 Bytes)
LHM/models/encoders/__pycache__/dinov2_fusion_wrapper.cpython-310.pyc
ADDED
Binary file (5 kB)
LHM/models/encoders/__pycache__/sapiens_warpper.cpython-310.pyc
ADDED
Binary file (9.24 kB)
LHM/models/encoders/dino_wrapper.py
ADDED
@@ -0,0 +1,68 @@
# Copyright (c) 2023-2024, Zexin He
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import torch
import torch.nn as nn
from transformers import ViTImageProcessor, ViTModel
from accelerate.logging import get_logger


logger = get_logger(__name__)


class DinoWrapper(nn.Module):
    """
    Dino v1 wrapper using huggingface transformer implementation.
    """
    def __init__(self, model_name: str, freeze: bool = True, encoder_feat_dim: int = 384):
        super().__init__()
        self.model, self.processor = self._build_dino(model_name)
        if freeze:
            self._freeze()

    @torch.compile
    def forward_model(self, inputs):
        return self.model(**inputs, interpolate_pos_encoding=True)

    def forward(self, image):
        # image: [N, C, H, W], on cpu
        # RGB image with [0,1] scale and properly sized
        inputs = self.processor(images=image, return_tensors="pt", do_rescale=False, do_resize=False).to(self.model.device)
        # This resampling of positional embedding uses bicubic interpolation
        outputs = self.forward_model(inputs)
        last_hidden_states = outputs.last_hidden_state
        return last_hidden_states

    def _freeze(self):
        logger.warning(f"======== Freezing DinoWrapper ========")
        self.model.eval()
        for name, param in self.model.named_parameters():
            param.requires_grad = False

    @staticmethod
    def _build_dino(model_name: str, proxy_error_retries: int = 3, proxy_error_cooldown: int = 5):
        import requests
        try:
            model = ViTModel.from_pretrained(model_name, add_pooling_layer=False)
            processor = ViTImageProcessor.from_pretrained(model_name)
            return model, processor
        except requests.exceptions.ProxyError as err:
            if proxy_error_retries > 0:
                print(f"Huggingface ProxyError: Retrying ({proxy_error_retries}) in {proxy_error_cooldown} seconds...")
                import time
                time.sleep(proxy_error_cooldown)
                return DinoWrapper._build_dino(model_name, proxy_error_retries - 1, proxy_error_cooldown)
            else:
                raise err
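A sketch of calling the `DinoWrapper` above; the `facebook/dino-vits16` checkpoint name is an example choice, and inputs are expected as [0, 1] RGB tensors already resized appropriately (the wrapper disables rescaling and resizing in the processor):

```python
import torch

from LHM.models.encoders.dino_wrapper import DinoWrapper

encoder = DinoWrapper(model_name="facebook/dino-vits16", freeze=True)

images = torch.rand(2, 3, 224, 224)  # [N, C, H, W] RGB in [0, 1]
tokens = encoder(images)             # [N, 1 + num_patches, hidden_dim] last hidden states
print(tokens.shape)
```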
LHM/models/encoders/dinov2/__init__.py
ADDED
@@ -0,0 +1,15 @@
# Copyright (c) 2023-2024, Zexin He
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Empty
LHM/models/encoders/dinov2/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (199 Bytes)
LHM/models/encoders/dinov2/hub/__init__.py
ADDED
@@ -0,0 +1,4 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.
LHM/models/encoders/dinov2/hub/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (203 Bytes)
LHM/models/encoders/dinov2/hub/__pycache__/backbones.cpython-310.pyc
ADDED
Binary file (4.47 kB)
LHM/models/encoders/dinov2/hub/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (1.82 kB)
LHM/models/encoders/dinov2/hub/backbones.py
ADDED
@@ -0,0 +1,166 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

from enum import Enum
from typing import Union

import torch

from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name


class Weights(Enum):
    LVD142M = "LVD142M"


def _make_dinov2_model(
    *,
    arch_name: str = "vit_large",
    img_size: int = 518,
    patch_size: int = 14,
    init_values: float = 1.0,
    ffn_layer: str = "mlp",
    block_chunks: int = 0,
    num_register_tokens: int = 0,
    interpolate_antialias: bool = False,
    interpolate_offset: float = 0.1,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.LVD142M,
    **kwargs,
):
    from ..models import vision_transformer as vits

    if isinstance(weights, str):
        try:
            weights = Weights[weights]
        except KeyError:
            raise AssertionError(f"Unsupported weights: {weights}")

    model_base_name = _make_dinov2_model_name(arch_name, patch_size)
    vit_kwargs = dict(
        img_size=img_size,
        patch_size=patch_size,
        init_values=init_values,
        ffn_layer=ffn_layer,
        block_chunks=block_chunks,
        num_register_tokens=num_register_tokens,
        interpolate_antialias=interpolate_antialias,
        interpolate_offset=interpolate_offset,
    )
    vit_kwargs.update(**kwargs)
    model = vits.__dict__[arch_name](**vit_kwargs)

    if pretrained:
        model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
        url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
        state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
        # ********** Modified by Zexin He in 2023-2024 **********
        state_dict = {k: v for k, v in state_dict.items() if 'mask_token' not in k}  # DDP concern
        if vit_kwargs.get("modulation_dim") is not None:
            state_dict = {
                k.replace('norm1', 'norm1.norm').replace('norm2', 'norm2.norm'): v
                for k, v in state_dict.items()
            }
            model.load_state_dict(state_dict, strict=False)
        else:
            model.load_state_dict(state_dict, strict=True)
        # ********************************************************

    return model


def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        weights=weights,
        pretrained=pretrained,
        **kwargs,
    )


def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_small",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_base",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_large",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        weights=weights,
        pretrained=pretrained,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )
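A sketch of how these hub entry points are typically used to pull a DINOv2 backbone and extract patch features; the input resolution must be a multiple of the 14-pixel patch size, and 518x518 simply mirrors the default `img_size`:

```python
import torch

from LHM.models.encoders.dinov2.hub.backbones import dinov2_vitb14

backbone = dinov2_vitb14(pretrained=True)  # downloads LVD-142M weights via torch.hub
backbone.eval()

with torch.no_grad():
    x = torch.rand(1, 3, 518, 518)
    feats = backbone.forward_features(x)
    patch_tokens = feats["x_norm_patchtokens"]  # [1, (518 // 14) ** 2, embed_dim]
```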
LHM/models/encoders/dinov2/hub/classifiers.py
ADDED
@@ -0,0 +1,268 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

from enum import Enum
from typing import Union

import torch
import torch.nn as nn

from .backbones import _make_dinov2_model
from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name


class Weights(Enum):
    IMAGENET1K = "IMAGENET1K"


def _make_dinov2_linear_classification_head(
    *,
    arch_name: str = "vit_large",
    patch_size: int = 14,
    embed_dim: int = 1024,
    layers: int = 4,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.IMAGENET1K,
    num_register_tokens: int = 0,
    **kwargs,
):
    if layers not in (1, 4):
        raise AssertionError(f"Unsupported number of layers: {layers}")
    if isinstance(weights, str):
        try:
            weights = Weights[weights]
        except KeyError:
            raise AssertionError(f"Unsupported weights: {weights}")

    linear_head = nn.Linear((1 + layers) * embed_dim, 1_000)

    if pretrained:
        model_base_name = _make_dinov2_model_name(arch_name, patch_size)
        model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
        layers_str = str(layers) if layers == 4 else ""
        url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_linear{layers_str}_head.pth"
        state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
        linear_head.load_state_dict(state_dict, strict=True)

    return linear_head


class _LinearClassifierWrapper(nn.Module):
    def __init__(self, *, backbone: nn.Module, linear_head: nn.Module, layers: int = 4):
        super().__init__()
        self.backbone = backbone
        self.linear_head = linear_head
        self.layers = layers

    def forward(self, x):
        if self.layers == 1:
            x = self.backbone.forward_features(x)
            cls_token = x["x_norm_clstoken"]
            patch_tokens = x["x_norm_patchtokens"]
            # fmt: off
            linear_input = torch.cat([
                cls_token,
                patch_tokens.mean(dim=1),
            ], dim=1)
            # fmt: on
        elif self.layers == 4:
            x = self.backbone.get_intermediate_layers(x, n=4, return_class_token=True)
            # fmt: off
            linear_input = torch.cat([
                x[0][1],
                x[1][1],
                x[2][1],
                x[3][1],
                x[3][0].mean(dim=1),
            ], dim=1)
            # fmt: on
        else:
            assert False, f"Unsupported number of layers: {self.layers}"
        return self.linear_head(linear_input)


def _make_dinov2_linear_classifier(
    *,
    arch_name: str = "vit_large",
    layers: int = 4,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.IMAGENET1K,
    num_register_tokens: int = 0,
    interpolate_antialias: bool = False,
    interpolate_offset: float = 0.1,
    **kwargs,
):
    backbone = _make_dinov2_model(
        arch_name=arch_name,
        pretrained=pretrained,
        num_register_tokens=num_register_tokens,
        interpolate_antialias=interpolate_antialias,
        interpolate_offset=interpolate_offset,
        **kwargs,
    )

    embed_dim = backbone.embed_dim
    patch_size = backbone.patch_size
    linear_head = _make_dinov2_linear_classification_head(
        arch_name=arch_name,
        patch_size=patch_size,
        embed_dim=embed_dim,
        layers=layers,
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=num_register_tokens,
    )

    return _LinearClassifierWrapper(backbone=backbone, linear_head=linear_head, layers=layers)


def dinov2_vits14_lc(
    *,
    layers: int = 4,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.IMAGENET1K,
    **kwargs,
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-S/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.
    """
    return _make_dinov2_linear_classifier(
        arch_name="vit_small",
        layers=layers,
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )


def dinov2_vitb14_lc(
    *,
    layers: int = 4,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.IMAGENET1K,
    **kwargs,
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-B/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.
    """
    return _make_dinov2_linear_classifier(
        arch_name="vit_base",
        layers=layers,
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )


def dinov2_vitl14_lc(
    *,
    layers: int = 4,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.IMAGENET1K,
    **kwargs,
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-L/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.
    """
    return _make_dinov2_linear_classifier(
        arch_name="vit_large",
        layers=layers,
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )


def dinov2_vitg14_lc(
    *,
    layers: int = 4,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.IMAGENET1K,
    **kwargs,
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-g/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.
    """
    return _make_dinov2_linear_classifier(
        arch_name="vit_giant2",
        layers=layers,
        ffn_layer="swiglufused",
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )


def dinov2_vits14_reg_lc(
    *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-S/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.
    """
    return _make_dinov2_linear_classifier(
        arch_name="vit_small",
        layers=layers,
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitb14_reg_lc(
    *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-B/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.
    """
    return _make_dinov2_linear_classifier(
        arch_name="vit_base",
        layers=layers,
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitl14_reg_lc(
    *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-L/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.
    """
    return _make_dinov2_linear_classifier(
        arch_name="vit_large",
        layers=layers,
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitg14_reg_lc(
    *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-g/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.
    """
    return _make_dinov2_linear_classifier(
        arch_name="vit_giant2",
        layers=layers,
        ffn_layer="swiglufused",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
|
| 267 |
+
**kwargs,
|
| 268 |
+
)
|
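A minimal usage sketch (not part of the uploaded file): the `*_lc` factories above build the backbone plus linear head directly, so they can be smoke-tested without downloading weights. The import path below is assumed from this upload's layout; upstream DINOv2 exposes the same entrypoints via torch.hub.

import torch

# Assumed import path, matching the file location in this upload.
from LHM.models.encoders.dinov2.hub.classifiers import dinov2_vitl14_lc

# pretrained=False skips the weight download; the architecture is still built.
model = dinov2_vitl14_lc(layers=4, pretrained=False).eval()

with torch.no_grad():
    # Spatial size must be a multiple of the 14-pixel patch size.
    logits = model(torch.randn(1, 3, 224, 224))

print(logits.shape)  # expected: torch.Size([1, 1000]) for the ImageNet-1k head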
LHM/models/encoders/dinov2/hub/depth/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

from .decode_heads import BNHead, DPTHead
from .encoder_decoder import DepthEncoderDecoder
|
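The three exports above are meant to be composed: a backbone that yields (feature map, class token) pairs feeds a decode head inside `DepthEncoderDecoder`. A minimal sketch with a toy, purely hypothetical backbone (import path assumed from this upload's layout):

import torch
import torch.nn as nn

from LHM.models.encoders.dinov2.hub.depth import BNHead, DepthEncoderDecoder  # assumed path


class ToyBackbone(nn.Module):
    """Hypothetical stand-in mimicking get_intermediate_layers-style output:
    four (patch_feature_map, class_token) pairs."""

    def forward(self, x):
        b = x.shape[0]
        return [(torch.randn(b, 32, 16, 16), torch.randn(b, 32)) for _ in range(4)]


# BNHead concatenates each feature map with its class token (32 + 32 channels),
# then resize-concats the four levels -> 4 * 64 = 256 channels.
head = BNHead(in_channels=[64] * 4, channels=256, min_depth=1e-3, max_depth=10.0)
model = DepthEncoderDecoder(backbone=ToyBackbone(), decode_head=head)

depth = model.forward_dummy(torch.randn(2, 3, 224, 224))
print(depth.shape)  # torch.Size([2, 1, 224, 224])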
LHM/models/encoders/dinov2/hub/depth/decode_heads.py
ADDED
|
@@ -0,0 +1,747 @@
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import copy
|
| 7 |
+
from functools import partial
|
| 8 |
+
import math
|
| 9 |
+
import warnings
|
| 10 |
+
|
| 11 |
+
import torch
|
| 12 |
+
import torch.nn as nn
|
| 13 |
+
|
| 14 |
+
from .ops import resize
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# XXX: (Untested) replacement for mmcv.imdenormalize()
|
| 18 |
+
def _imdenormalize(img, mean, std, to_bgr=True):
|
| 19 |
+
import numpy as np
|
| 20 |
+
|
| 21 |
+
mean = mean.reshape(1, -1).astype(np.float64)
|
| 22 |
+
std = std.reshape(1, -1).astype(np.float64)
|
| 23 |
+
img = (img * std) + mean
|
| 24 |
+
if to_bgr:
|
| 25 |
+
img = img[::-1]
|
| 26 |
+
return img
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class DepthBaseDecodeHead(nn.Module):
|
| 30 |
+
"""Base class for BaseDecodeHead.
|
| 31 |
+
|
| 32 |
+
Args:
|
| 33 |
+
in_channels (List): Input channels.
|
| 34 |
+
channels (int): Channels after modules, before conv_depth.
|
| 35 |
+
conv_layer (nn.Module): Conv layers. Default: None.
|
| 36 |
+
act_layer (nn.Module): Activation layers. Default: nn.ReLU.
|
| 37 |
+
loss_decode (dict): Config of decode loss.
|
| 38 |
+
Default: ().
|
| 39 |
+
sampler (dict|None): The config of depth map sampler.
|
| 40 |
+
Default: None.
|
| 41 |
+
align_corners (bool): align_corners argument of F.interpolate.
|
| 42 |
+
Default: False.
|
| 43 |
+
min_depth (int): Min depth in dataset setting.
|
| 44 |
+
Default: 1e-3.
|
| 45 |
+
max_depth (int): Max depth in dataset setting.
|
| 46 |
+
Default: None.
|
| 47 |
+
norm_layer (dict|None): Norm layers.
|
| 48 |
+
Default: None.
|
| 49 |
+
classify (bool): Whether predict depth in a cls.-reg. manner.
|
| 50 |
+
Default: False.
|
| 51 |
+
n_bins (int): The number of bins used in cls. step.
|
| 52 |
+
Default: 256.
|
| 53 |
+
bins_strategy (str): The discrete strategy used in cls. step.
|
| 54 |
+
Default: 'UD'.
|
| 55 |
+
norm_strategy (str): The norm strategy on cls. probability
|
| 56 |
+
distribution. Default: 'linear'
|
| 57 |
+
scale_up (str): Whether predict depth in a scale-up manner.
|
| 58 |
+
Default: False.
|
| 59 |
+
"""
|
| 60 |
+
|
| 61 |
+
def __init__(
|
| 62 |
+
self,
|
| 63 |
+
in_channels,
|
| 64 |
+
conv_layer=None,
|
| 65 |
+
act_layer=nn.ReLU,
|
| 66 |
+
channels=96,
|
| 67 |
+
loss_decode=(),
|
| 68 |
+
sampler=None,
|
| 69 |
+
align_corners=False,
|
| 70 |
+
min_depth=1e-3,
|
| 71 |
+
max_depth=None,
|
| 72 |
+
norm_layer=None,
|
| 73 |
+
classify=False,
|
| 74 |
+
n_bins=256,
|
| 75 |
+
bins_strategy="UD",
|
| 76 |
+
norm_strategy="linear",
|
| 77 |
+
scale_up=False,
|
| 78 |
+
):
|
| 79 |
+
super(DepthBaseDecodeHead, self).__init__()
|
| 80 |
+
|
| 81 |
+
self.in_channels = in_channels
|
| 82 |
+
self.channels = channels
|
| 83 |
+
self.conv_layer = conv_layer
|
| 84 |
+
self.act_layer = act_layer
|
| 85 |
+
self.loss_decode = loss_decode
|
| 86 |
+
self.align_corners = align_corners
|
| 87 |
+
self.min_depth = min_depth
|
| 88 |
+
self.max_depth = max_depth
|
| 89 |
+
self.norm_layer = norm_layer
|
| 90 |
+
self.classify = classify
|
| 91 |
+
self.n_bins = n_bins
|
| 92 |
+
self.scale_up = scale_up
|
| 93 |
+
|
| 94 |
+
if self.classify:
|
| 95 |
+
assert bins_strategy in ["UD", "SID"], "Support bins_strategy: UD, SID"
|
| 96 |
+
assert norm_strategy in ["linear", "softmax", "sigmoid"], "Support norm_strategy: linear, softmax, sigmoid"
|
| 97 |
+
|
| 98 |
+
self.bins_strategy = bins_strategy
|
| 99 |
+
self.norm_strategy = norm_strategy
|
| 100 |
+
self.softmax = nn.Softmax(dim=1)
|
| 101 |
+
self.conv_depth = nn.Conv2d(channels, n_bins, kernel_size=3, padding=1, stride=1)
|
| 102 |
+
else:
|
| 103 |
+
self.conv_depth = nn.Conv2d(channels, 1, kernel_size=3, padding=1, stride=1)
|
| 104 |
+
|
| 105 |
+
self.relu = nn.ReLU()
|
| 106 |
+
self.sigmoid = nn.Sigmoid()
|
| 107 |
+
|
| 108 |
+
def forward(self, inputs, img_metas):
|
| 109 |
+
"""Placeholder of forward function."""
|
| 110 |
+
pass
|
| 111 |
+
|
| 112 |
+
def forward_train(self, img, inputs, img_metas, depth_gt):
|
| 113 |
+
"""Forward function for training.
|
| 114 |
+
Args:
|
| 115 |
+
inputs (list[Tensor]): List of multi-level img features.
|
| 116 |
+
img_metas (list[dict]): List of image info dict where each dict
|
| 117 |
+
has: 'img_shape', 'scale_factor', 'flip', and may also contain
|
| 118 |
+
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
|
| 119 |
+
For details on the values of these keys see
|
| 120 |
+
`depth/datasets/pipelines/formatting.py:Collect`.
|
| 121 |
+
depth_gt (Tensor): GT depth
|
| 122 |
+
|
| 123 |
+
Returns:
|
| 124 |
+
dict[str, Tensor]: a dictionary of loss components
|
| 125 |
+
"""
|
| 126 |
+
depth_pred = self.forward(inputs, img_metas)
|
| 127 |
+
losses = self.losses(depth_pred, depth_gt)
|
| 128 |
+
|
| 129 |
+
log_imgs = self.log_images(img[0], depth_pred[0], depth_gt[0], img_metas[0])
|
| 130 |
+
losses.update(**log_imgs)
|
| 131 |
+
|
| 132 |
+
return losses
|
| 133 |
+
|
| 134 |
+
def forward_test(self, inputs, img_metas):
|
| 135 |
+
"""Forward function for testing.
|
| 136 |
+
Args:
|
| 137 |
+
inputs (list[Tensor]): List of multi-level img features.
|
| 138 |
+
img_metas (list[dict]): List of image info dict where each dict
|
| 139 |
+
has: 'img_shape', 'scale_factor', 'flip', and may also contain
|
| 140 |
+
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
|
| 141 |
+
For details on the values of these keys see
|
| 142 |
+
`depth/datasets/pipelines/formatting.py:Collect`.
|
| 143 |
+
|
| 144 |
+
Returns:
|
| 145 |
+
Tensor: Output depth map.
|
| 146 |
+
"""
|
| 147 |
+
return self.forward(inputs, img_metas)
|
| 148 |
+
|
| 149 |
+
def depth_pred(self, feat):
|
| 150 |
+
"""Prediction each pixel."""
|
| 151 |
+
if self.classify:
|
| 152 |
+
logit = self.conv_depth(feat)
|
| 153 |
+
|
| 154 |
+
if self.bins_strategy == "UD":
|
| 155 |
+
bins = torch.linspace(self.min_depth, self.max_depth, self.n_bins, device=feat.device)
|
| 156 |
+
elif self.bins_strategy == "SID":
|
| 157 |
+
bins = torch.logspace(self.min_depth, self.max_depth, self.n_bins, device=feat.device)
|
| 158 |
+
|
| 159 |
+
# following Adabins, default linear
|
| 160 |
+
if self.norm_strategy == "linear":
|
| 161 |
+
logit = torch.relu(logit)
|
| 162 |
+
eps = 0.1
|
| 163 |
+
logit = logit + eps
|
| 164 |
+
logit = logit / logit.sum(dim=1, keepdim=True)
|
| 165 |
+
elif self.norm_strategy == "softmax":
|
| 166 |
+
logit = torch.softmax(logit, dim=1)
|
| 167 |
+
elif self.norm_strategy == "sigmoid":
|
| 168 |
+
logit = torch.sigmoid(logit)
|
| 169 |
+
logit = logit / logit.sum(dim=1, keepdim=True)
|
| 170 |
+
|
| 171 |
+
output = torch.einsum("ikmn,k->imn", [logit, bins]).unsqueeze(dim=1)
|
| 172 |
+
|
| 173 |
+
else:
|
| 174 |
+
if self.scale_up:
|
| 175 |
+
output = self.sigmoid(self.conv_depth(feat)) * self.max_depth
|
| 176 |
+
else:
|
| 177 |
+
output = self.relu(self.conv_depth(feat)) + self.min_depth
|
| 178 |
+
return output
|
| 179 |
+
|
| 180 |
+
def losses(self, depth_pred, depth_gt):
|
| 181 |
+
"""Compute depth loss."""
|
| 182 |
+
loss = dict()
|
| 183 |
+
depth_pred = resize(
|
| 184 |
+
input=depth_pred, size=depth_gt.shape[2:], mode="bilinear", align_corners=self.align_corners, warning=False
|
| 185 |
+
)
|
| 186 |
+
if not isinstance(self.loss_decode, nn.ModuleList):
|
| 187 |
+
losses_decode = [self.loss_decode]
|
| 188 |
+
else:
|
| 189 |
+
losses_decode = self.loss_decode
|
| 190 |
+
for loss_decode in losses_decode:
|
| 191 |
+
if loss_decode.loss_name not in loss:
|
| 192 |
+
loss[loss_decode.loss_name] = loss_decode(depth_pred, depth_gt)
|
| 193 |
+
else:
|
| 194 |
+
loss[loss_decode.loss_name] += loss_decode(depth_pred, depth_gt)
|
| 195 |
+
return loss
|
| 196 |
+
|
| 197 |
+
def log_images(self, img_path, depth_pred, depth_gt, img_meta):
|
| 198 |
+
import numpy as np
|
| 199 |
+
|
| 200 |
+
show_img = copy.deepcopy(img_path.detach().cpu().permute(1, 2, 0))
|
| 201 |
+
show_img = show_img.numpy().astype(np.float32)
|
| 202 |
+
show_img = _imdenormalize(
|
| 203 |
+
show_img,
|
| 204 |
+
img_meta["img_norm_cfg"]["mean"],
|
| 205 |
+
img_meta["img_norm_cfg"]["std"],
|
| 206 |
+
img_meta["img_norm_cfg"]["to_rgb"],
|
| 207 |
+
)
|
| 208 |
+
show_img = np.clip(show_img, 0, 255)
|
| 209 |
+
show_img = show_img.astype(np.uint8)
|
| 210 |
+
show_img = show_img[:, :, ::-1]
|
| 211 |
+
show_img = show_img.transpose(0, 2, 1)
|
| 212 |
+
show_img = show_img.transpose(1, 0, 2)
|
| 213 |
+
|
| 214 |
+
depth_pred = depth_pred / torch.max(depth_pred)
|
| 215 |
+
depth_gt = depth_gt / torch.max(depth_gt)
|
| 216 |
+
|
| 217 |
+
depth_pred_color = copy.deepcopy(depth_pred.detach().cpu())
|
| 218 |
+
depth_gt_color = copy.deepcopy(depth_gt.detach().cpu())
|
| 219 |
+
|
| 220 |
+
return {"img_rgb": show_img, "img_depth_pred": depth_pred_color, "img_depth_gt": depth_gt_color}
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
class BNHead(DepthBaseDecodeHead):
|
| 224 |
+
"""Just a batchnorm."""
|
| 225 |
+
|
| 226 |
+
def __init__(self, input_transform="resize_concat", in_index=(0, 1, 2, 3), upsample=1, **kwargs):
|
| 227 |
+
super().__init__(**kwargs)
|
| 228 |
+
self.input_transform = input_transform
|
| 229 |
+
self.in_index = in_index
|
| 230 |
+
self.upsample = upsample
|
| 231 |
+
# self.bn = nn.SyncBatchNorm(self.in_channels)
|
| 232 |
+
if self.classify:
|
| 233 |
+
self.conv_depth = nn.Conv2d(self.channels, self.n_bins, kernel_size=1, padding=0, stride=1)
|
| 234 |
+
else:
|
| 235 |
+
self.conv_depth = nn.Conv2d(self.channels, 1, kernel_size=1, padding=0, stride=1)
|
| 236 |
+
|
| 237 |
+
def _transform_inputs(self, inputs):
|
| 238 |
+
"""Transform inputs for decoder.
|
| 239 |
+
Args:
|
| 240 |
+
inputs (list[Tensor]): List of multi-level img features.
|
| 241 |
+
Returns:
|
| 242 |
+
Tensor: The transformed inputs
|
| 243 |
+
"""
|
| 244 |
+
|
| 245 |
+
if "concat" in self.input_transform:
|
| 246 |
+
inputs = [inputs[i] for i in self.in_index]
|
| 247 |
+
if "resize" in self.input_transform:
|
| 248 |
+
inputs = [
|
| 249 |
+
resize(
|
| 250 |
+
input=x,
|
| 251 |
+
size=[s * self.upsample for s in inputs[0].shape[2:]],
|
| 252 |
+
mode="bilinear",
|
| 253 |
+
align_corners=self.align_corners,
|
| 254 |
+
)
|
| 255 |
+
for x in inputs
|
| 256 |
+
]
|
| 257 |
+
inputs = torch.cat(inputs, dim=1)
|
| 258 |
+
elif self.input_transform == "multiple_select":
|
| 259 |
+
inputs = [inputs[i] for i in self.in_index]
|
| 260 |
+
else:
|
| 261 |
+
inputs = inputs[self.in_index]
|
| 262 |
+
|
| 263 |
+
return inputs
|
| 264 |
+
|
| 265 |
+
def _forward_feature(self, inputs, img_metas=None, **kwargs):
|
| 266 |
+
"""Forward function for feature maps before classifying each pixel with
|
| 267 |
+
``self.cls_seg`` fc.
|
| 268 |
+
Args:
|
| 269 |
+
inputs (list[Tensor]): List of multi-level img features.
|
| 270 |
+
Returns:
|
| 271 |
+
feats (Tensor): A tensor of shape (batch_size, self.channels,
|
| 272 |
+
H, W) which is feature map for last layer of decoder head.
|
| 273 |
+
"""
|
| 274 |
+
# accept lists (for cls token)
|
| 275 |
+
inputs = list(inputs)
|
| 276 |
+
for i, x in enumerate(inputs):
|
| 277 |
+
if len(x) == 2:
|
| 278 |
+
x, cls_token = x[0], x[1]
|
| 279 |
+
if len(x.shape) == 2:
|
| 280 |
+
x = x[:, :, None, None]
|
| 281 |
+
cls_token = cls_token[:, :, None, None].expand_as(x)
|
| 282 |
+
inputs[i] = torch.cat((x, cls_token), 1)
|
| 283 |
+
else:
|
| 284 |
+
x = x[0]
|
| 285 |
+
if len(x.shape) == 2:
|
| 286 |
+
x = x[:, :, None, None]
|
| 287 |
+
inputs[i] = x
|
| 288 |
+
x = self._transform_inputs(inputs)
|
| 289 |
+
# feats = self.bn(x)
|
| 290 |
+
return x
|
| 291 |
+
|
| 292 |
+
def forward(self, inputs, img_metas=None, **kwargs):
|
| 293 |
+
"""Forward function."""
|
| 294 |
+
output = self._forward_feature(inputs, img_metas=img_metas, **kwargs)
|
| 295 |
+
output = self.depth_pred(output)
|
| 296 |
+
return output
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
class ConvModule(nn.Module):
|
| 300 |
+
"""A conv block that bundles conv/norm/activation layers.
|
| 301 |
+
|
| 302 |
+
This block simplifies the usage of convolution layers, which are commonly
|
| 303 |
+
used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
|
| 304 |
+
It is based upon three build methods: `build_conv_layer()`,
|
| 305 |
+
`build_norm_layer()` and `build_activation_layer()`.
|
| 306 |
+
|
| 307 |
+
Besides, we add some additional features in this module.
|
| 308 |
+
1. Automatically set `bias` of the conv layer.
|
| 309 |
+
2. Spectral norm is supported.
|
| 310 |
+
3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only
|
| 311 |
+
supports zero and circular padding, and we add "reflect" padding mode.
|
| 312 |
+
|
| 313 |
+
Args:
|
| 314 |
+
in_channels (int): Number of channels in the input feature map.
|
| 315 |
+
Same as that in ``nn._ConvNd``.
|
| 316 |
+
out_channels (int): Number of channels produced by the convolution.
|
| 317 |
+
Same as that in ``nn._ConvNd``.
|
| 318 |
+
kernel_size (int | tuple[int]): Size of the convolving kernel.
|
| 319 |
+
Same as that in ``nn._ConvNd``.
|
| 320 |
+
stride (int | tuple[int]): Stride of the convolution.
|
| 321 |
+
Same as that in ``nn._ConvNd``.
|
| 322 |
+
padding (int | tuple[int]): Zero-padding added to both sides of
|
| 323 |
+
the input. Same as that in ``nn._ConvNd``.
|
| 324 |
+
dilation (int | tuple[int]): Spacing between kernel elements.
|
| 325 |
+
Same as that in ``nn._ConvNd``.
|
| 326 |
+
groups (int): Number of blocked connections from input channels to
|
| 327 |
+
output channels. Same as that in ``nn._ConvNd``.
|
| 328 |
+
bias (bool | str): If specified as `auto`, it will be decided by the
|
| 329 |
+
norm_layer. Bias will be set as True if `norm_layer` is None, otherwise
|
| 330 |
+
False. Default: "auto".
|
| 331 |
+
conv_layer (nn.Module): Convolution layer. Default: None,
|
| 332 |
+
which means using conv2d.
|
| 333 |
+
norm_layer (nn.Module): Normalization layer. Default: None.
|
| 334 |
+
act_layer (nn.Module): Activation layer. Default: nn.ReLU.
|
| 335 |
+
inplace (bool): Whether to use inplace mode for activation.
|
| 336 |
+
Default: True.
|
| 337 |
+
with_spectral_norm (bool): Whether use spectral norm in conv module.
|
| 338 |
+
Default: False.
|
| 339 |
+
padding_mode (str): If the `padding_mode` has not been supported by
|
| 340 |
+
current `Conv2d` in PyTorch, we will use our own padding layer
|
| 341 |
+
instead. Currently, we support ['zeros', 'circular'] with official
|
| 342 |
+
implementation and ['reflect'] with our own implementation.
|
| 343 |
+
Default: 'zeros'.
|
| 344 |
+
order (tuple[str]): The order of conv/norm/activation layers. It is a
|
| 345 |
+
sequence of "conv", "norm" and "act". Common examples are
|
| 346 |
+
("conv", "norm", "act") and ("act", "conv", "norm").
|
| 347 |
+
Default: ('conv', 'norm', 'act').
|
| 348 |
+
"""
|
| 349 |
+
|
| 350 |
+
_abbr_ = "conv_block"
|
| 351 |
+
|
| 352 |
+
def __init__(
|
| 353 |
+
self,
|
| 354 |
+
in_channels,
|
| 355 |
+
out_channels,
|
| 356 |
+
kernel_size,
|
| 357 |
+
stride=1,
|
| 358 |
+
padding=0,
|
| 359 |
+
dilation=1,
|
| 360 |
+
groups=1,
|
| 361 |
+
bias="auto",
|
| 362 |
+
conv_layer=nn.Conv2d,
|
| 363 |
+
norm_layer=None,
|
| 364 |
+
act_layer=nn.ReLU,
|
| 365 |
+
inplace=True,
|
| 366 |
+
with_spectral_norm=False,
|
| 367 |
+
padding_mode="zeros",
|
| 368 |
+
order=("conv", "norm", "act"),
|
| 369 |
+
):
|
| 370 |
+
super(ConvModule, self).__init__()
|
| 371 |
+
official_padding_mode = ["zeros", "circular"]
|
| 372 |
+
self.conv_layer = conv_layer
|
| 373 |
+
self.norm_layer = norm_layer
|
| 374 |
+
self.act_layer = act_layer
|
| 375 |
+
self.inplace = inplace
|
| 376 |
+
self.with_spectral_norm = with_spectral_norm
|
| 377 |
+
self.with_explicit_padding = padding_mode not in official_padding_mode
|
| 378 |
+
self.order = order
|
| 379 |
+
assert isinstance(self.order, tuple) and len(self.order) == 3
|
| 380 |
+
assert set(order) == set(["conv", "norm", "act"])
|
| 381 |
+
|
| 382 |
+
self.with_norm = norm_layer is not None
|
| 383 |
+
self.with_activation = act_layer is not None
|
| 384 |
+
# if the conv layer is before a norm layer, bias is unnecessary.
|
| 385 |
+
if bias == "auto":
|
| 386 |
+
bias = not self.with_norm
|
| 387 |
+
self.with_bias = bias
|
| 388 |
+
|
| 389 |
+
if self.with_explicit_padding:
|
| 390 |
+
if padding_mode == "zeros":
|
| 391 |
+
padding_layer = nn.ZeroPad2d
|
| 392 |
+
else:
|
| 393 |
+
raise AssertionError(f"Unsupported padding mode: {padding_mode}")
|
| 394 |
+
self.pad = padding_layer(padding)
|
| 395 |
+
|
| 396 |
+
# reset padding to 0 for conv module
|
| 397 |
+
conv_padding = 0 if self.with_explicit_padding else padding
|
| 398 |
+
# build convolution layer
|
| 399 |
+
self.conv = self.conv_layer(
|
| 400 |
+
in_channels,
|
| 401 |
+
out_channels,
|
| 402 |
+
kernel_size,
|
| 403 |
+
stride=stride,
|
| 404 |
+
padding=conv_padding,
|
| 405 |
+
dilation=dilation,
|
| 406 |
+
groups=groups,
|
| 407 |
+
bias=bias,
|
| 408 |
+
)
|
| 409 |
+
# export the attributes of self.conv to a higher level for convenience
|
| 410 |
+
self.in_channels = self.conv.in_channels
|
| 411 |
+
self.out_channels = self.conv.out_channels
|
| 412 |
+
self.kernel_size = self.conv.kernel_size
|
| 413 |
+
self.stride = self.conv.stride
|
| 414 |
+
self.padding = padding
|
| 415 |
+
self.dilation = self.conv.dilation
|
| 416 |
+
self.transposed = self.conv.transposed
|
| 417 |
+
self.output_padding = self.conv.output_padding
|
| 418 |
+
self.groups = self.conv.groups
|
| 419 |
+
|
| 420 |
+
if self.with_spectral_norm:
|
| 421 |
+
self.conv = nn.utils.spectral_norm(self.conv)
|
| 422 |
+
|
| 423 |
+
# build normalization layers
|
| 424 |
+
if self.with_norm:
|
| 425 |
+
# norm layer is after conv layer
|
| 426 |
+
if order.index("norm") > order.index("conv"):
|
| 427 |
+
norm_channels = out_channels
|
| 428 |
+
else:
|
| 429 |
+
norm_channels = in_channels
|
| 430 |
+
norm = partial(norm_layer, num_features=norm_channels)
|
| 431 |
+
self.add_module("norm", norm)
|
| 432 |
+
if self.with_bias:
|
| 433 |
+
from torch.nn.modules.batchnorm import _BatchNorm
|
| 434 |
+
from torch.nn.modules.instancenorm import _InstanceNorm
|
| 435 |
+
|
| 436 |
+
if isinstance(norm, (_BatchNorm, _InstanceNorm)):
|
| 437 |
+
warnings.warn("Unnecessary conv bias before batch/instance norm")
|
| 438 |
+
else:
|
| 439 |
+
self.norm_name = None
|
| 440 |
+
|
| 441 |
+
# build activation layer
|
| 442 |
+
if self.with_activation:
|
| 443 |
+
# nn.Tanh has no 'inplace' argument
|
| 444 |
+
# (nn.Tanh, nn.PReLU, nn.Sigmoid, nn.HSigmoid, nn.Swish, nn.GELU)
|
| 445 |
+
if not isinstance(act_layer, (nn.Tanh, nn.PReLU, nn.Sigmoid, nn.GELU)):
|
| 446 |
+
act_layer = partial(act_layer, inplace=inplace)
|
| 447 |
+
self.activate = act_layer()
|
| 448 |
+
|
| 449 |
+
# Use msra init by default
|
| 450 |
+
self.init_weights()
|
| 451 |
+
|
| 452 |
+
@property
|
| 453 |
+
def norm(self):
|
| 454 |
+
if self.norm_name:
|
| 455 |
+
return getattr(self, self.norm_name)
|
| 456 |
+
else:
|
| 457 |
+
return None
|
| 458 |
+
|
| 459 |
+
def init_weights(self):
|
| 460 |
+
# 1. It is mainly for customized conv layers with their own
|
| 461 |
+
# initialization manners by calling their own ``init_weights()``,
|
| 462 |
+
# and we do not want ConvModule to override the initialization.
|
| 463 |
+
# 2. For customized conv layers without their own initialization
|
| 464 |
+
# manners (that is, they don't have their own ``init_weights()``)
|
| 465 |
+
# and PyTorch's conv layers, they will be initialized by
|
| 466 |
+
# this method with default ``kaiming_init``.
|
| 467 |
+
# Note: For PyTorch's conv layers, they will be overwritten by our
|
| 468 |
+
# initialization implementation using default ``kaiming_init``.
|
| 469 |
+
if not hasattr(self.conv, "init_weights"):
|
| 470 |
+
if self.with_activation and isinstance(self.act_layer, nn.LeakyReLU):
|
| 471 |
+
nonlinearity = "leaky_relu"
|
| 472 |
+
a = 0.01 # XXX: default negative_slope
|
| 473 |
+
else:
|
| 474 |
+
nonlinearity = "relu"
|
| 475 |
+
a = 0
|
| 476 |
+
if hasattr(self.conv, "weight") and self.conv.weight is not None:
|
| 477 |
+
nn.init.kaiming_normal_(self.conv.weight, a=a, mode="fan_out", nonlinearity=nonlinearity)
|
| 478 |
+
if hasattr(self.conv, "bias") and self.conv.bias is not None:
|
| 479 |
+
nn.init.constant_(self.conv.bias, 0)
|
| 480 |
+
if self.with_norm:
|
| 481 |
+
if hasattr(self.norm, "weight") and self.norm.weight is not None:
|
| 482 |
+
nn.init.constant_(self.norm.weight, 1)
|
| 483 |
+
if hasattr(self.norm, "bias") and self.norm.bias is not None:
|
| 484 |
+
nn.init.constant_(self.norm.bias, 0)
|
| 485 |
+
|
| 486 |
+
def forward(self, x, activate=True, norm=True):
|
| 487 |
+
for layer in self.order:
|
| 488 |
+
if layer == "conv":
|
| 489 |
+
if self.with_explicit_padding:
|
| 490 |
+
x = self.pad(x)
|
| 491 |
+
x = self.conv(x)
|
| 492 |
+
elif layer == "norm" and norm and self.with_norm:
|
| 493 |
+
x = self.norm(x)
|
| 494 |
+
elif layer == "act" and activate and self.with_activation:
|
| 495 |
+
x = self.activate(x)
|
| 496 |
+
return x
|
| 497 |
+
|
| 498 |
+
|
| 499 |
+
class Interpolate(nn.Module):
|
| 500 |
+
def __init__(self, scale_factor, mode, align_corners=False):
|
| 501 |
+
super(Interpolate, self).__init__()
|
| 502 |
+
self.interp = nn.functional.interpolate
|
| 503 |
+
self.scale_factor = scale_factor
|
| 504 |
+
self.mode = mode
|
| 505 |
+
self.align_corners = align_corners
|
| 506 |
+
|
| 507 |
+
def forward(self, x):
|
| 508 |
+
x = self.interp(x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners)
|
| 509 |
+
return x
|
| 510 |
+
|
| 511 |
+
|
| 512 |
+
class HeadDepth(nn.Module):
|
| 513 |
+
def __init__(self, features):
|
| 514 |
+
super(HeadDepth, self).__init__()
|
| 515 |
+
self.head = nn.Sequential(
|
| 516 |
+
nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
|
| 517 |
+
Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
|
| 518 |
+
nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
|
| 519 |
+
nn.ReLU(),
|
| 520 |
+
nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
|
| 521 |
+
)
|
| 522 |
+
|
| 523 |
+
def forward(self, x):
|
| 524 |
+
x = self.head(x)
|
| 525 |
+
return x
|
| 526 |
+
|
| 527 |
+
|
| 528 |
+
class ReassembleBlocks(nn.Module):
|
| 529 |
+
"""ViTPostProcessBlock, process cls_token in ViT backbone output and
|
| 530 |
+
rearrange the feature vector to feature map.
|
| 531 |
+
Args:
|
| 532 |
+
in_channels (int): ViT feature channels. Default: 768.
|
| 533 |
+
out_channels (List): output channels of each stage.
|
| 534 |
+
Default: [96, 192, 384, 768].
|
| 535 |
+
readout_type (str): Type of readout operation. Default: 'ignore'.
|
| 536 |
+
patch_size (int): The patch size. Default: 16.
|
| 537 |
+
"""
|
| 538 |
+
|
| 539 |
+
def __init__(self, in_channels=768, out_channels=[96, 192, 384, 768], readout_type="ignore", patch_size=16):
|
| 540 |
+
super(ReassembleBlocks, self).__init__()
|
| 541 |
+
|
| 542 |
+
assert readout_type in ["ignore", "add", "project"]
|
| 543 |
+
self.readout_type = readout_type
|
| 544 |
+
self.patch_size = patch_size
|
| 545 |
+
|
| 546 |
+
self.projects = nn.ModuleList(
|
| 547 |
+
[
|
| 548 |
+
ConvModule(
|
| 549 |
+
in_channels=in_channels,
|
| 550 |
+
out_channels=out_channel,
|
| 551 |
+
kernel_size=1,
|
| 552 |
+
act_layer=None,
|
| 553 |
+
)
|
| 554 |
+
for out_channel in out_channels
|
| 555 |
+
]
|
| 556 |
+
)
|
| 557 |
+
|
| 558 |
+
self.resize_layers = nn.ModuleList(
|
| 559 |
+
[
|
| 560 |
+
nn.ConvTranspose2d(
|
| 561 |
+
in_channels=out_channels[0], out_channels=out_channels[0], kernel_size=4, stride=4, padding=0
|
| 562 |
+
),
|
| 563 |
+
nn.ConvTranspose2d(
|
| 564 |
+
in_channels=out_channels[1], out_channels=out_channels[1], kernel_size=2, stride=2, padding=0
|
| 565 |
+
),
|
| 566 |
+
nn.Identity(),
|
| 567 |
+
nn.Conv2d(
|
| 568 |
+
in_channels=out_channels[3], out_channels=out_channels[3], kernel_size=3, stride=2, padding=1
|
| 569 |
+
),
|
| 570 |
+
]
|
| 571 |
+
)
|
| 572 |
+
if self.readout_type == "project":
|
| 573 |
+
self.readout_projects = nn.ModuleList()
|
| 574 |
+
for _ in range(len(self.projects)):
|
| 575 |
+
self.readout_projects.append(nn.Sequential(nn.Linear(2 * in_channels, in_channels), nn.GELU()))
|
| 576 |
+
|
| 577 |
+
def forward(self, inputs):
|
| 578 |
+
assert isinstance(inputs, list)
|
| 579 |
+
out = []
|
| 580 |
+
for i, x in enumerate(inputs):
|
| 581 |
+
assert len(x) == 2
|
| 582 |
+
x, cls_token = x[0], x[1]
|
| 583 |
+
feature_shape = x.shape
|
| 584 |
+
if self.readout_type == "project":
|
| 585 |
+
x = x.flatten(2).permute((0, 2, 1))
|
| 586 |
+
readout = cls_token.unsqueeze(1).expand_as(x)
|
| 587 |
+
x = self.readout_projects[i](torch.cat((x, readout), -1))
|
| 588 |
+
x = x.permute(0, 2, 1).reshape(feature_shape)
|
| 589 |
+
elif self.readout_type == "add":
|
| 590 |
+
x = x.flatten(2) + cls_token.unsqueeze(-1)
|
| 591 |
+
x = x.reshape(feature_shape)
|
| 592 |
+
else:
|
| 593 |
+
pass
|
| 594 |
+
x = self.projects[i](x)
|
| 595 |
+
x = self.resize_layers[i](x)
|
| 596 |
+
out.append(x)
|
| 597 |
+
return out
|
| 598 |
+
|
| 599 |
+
|
| 600 |
+
class PreActResidualConvUnit(nn.Module):
|
| 601 |
+
"""ResidualConvUnit, pre-activate residual unit.
|
| 602 |
+
Args:
|
| 603 |
+
in_channels (int): number of channels in the input feature map.
|
| 604 |
+
act_layer (nn.Module): activation layer.
|
| 605 |
+
norm_layer (nn.Module): norm layer.
|
| 606 |
+
stride (int): stride of the first block. Default: 1
|
| 607 |
+
dilation (int): dilation rate for convs layers. Default: 1.
|
| 608 |
+
"""
|
| 609 |
+
|
| 610 |
+
def __init__(self, in_channels, act_layer, norm_layer, stride=1, dilation=1):
|
| 611 |
+
super(PreActResidualConvUnit, self).__init__()
|
| 612 |
+
|
| 613 |
+
self.conv1 = ConvModule(
|
| 614 |
+
in_channels,
|
| 615 |
+
in_channels,
|
| 616 |
+
3,
|
| 617 |
+
stride=stride,
|
| 618 |
+
padding=dilation,
|
| 619 |
+
dilation=dilation,
|
| 620 |
+
norm_layer=norm_layer,
|
| 621 |
+
act_layer=act_layer,
|
| 622 |
+
bias=False,
|
| 623 |
+
order=("act", "conv", "norm"),
|
| 624 |
+
)
|
| 625 |
+
|
| 626 |
+
self.conv2 = ConvModule(
|
| 627 |
+
in_channels,
|
| 628 |
+
in_channels,
|
| 629 |
+
3,
|
| 630 |
+
padding=1,
|
| 631 |
+
norm_layer=norm_layer,
|
| 632 |
+
act_layer=act_layer,
|
| 633 |
+
bias=False,
|
| 634 |
+
order=("act", "conv", "norm"),
|
| 635 |
+
)
|
| 636 |
+
|
| 637 |
+
def forward(self, inputs):
|
| 638 |
+
inputs_ = inputs.clone()
|
| 639 |
+
x = self.conv1(inputs)
|
| 640 |
+
x = self.conv2(x)
|
| 641 |
+
return x + inputs_
|
| 642 |
+
|
| 643 |
+
|
| 644 |
+
class FeatureFusionBlock(nn.Module):
|
| 645 |
+
"""FeatureFusionBlock, merge feature map from different stages.
|
| 646 |
+
Args:
|
| 647 |
+
in_channels (int): Input channels.
|
| 648 |
+
act_layer (nn.Module): activation layer for ResidualConvUnit.
|
| 649 |
+
norm_layer (nn.Module): normalization layer.
|
| 650 |
+
expand (bool): Whether expand the channels in post process block.
|
| 651 |
+
Default: False.
|
| 652 |
+
align_corners (bool): align_corner setting for bilinear upsample.
|
| 653 |
+
Default: True.
|
| 654 |
+
"""
|
| 655 |
+
|
| 656 |
+
def __init__(self, in_channels, act_layer, norm_layer, expand=False, align_corners=True):
|
| 657 |
+
super(FeatureFusionBlock, self).__init__()
|
| 658 |
+
|
| 659 |
+
self.in_channels = in_channels
|
| 660 |
+
self.expand = expand
|
| 661 |
+
self.align_corners = align_corners
|
| 662 |
+
|
| 663 |
+
self.out_channels = in_channels
|
| 664 |
+
if self.expand:
|
| 665 |
+
self.out_channels = in_channels // 2
|
| 666 |
+
|
| 667 |
+
self.project = ConvModule(self.in_channels, self.out_channels, kernel_size=1, act_layer=None, bias=True)
|
| 668 |
+
|
| 669 |
+
self.res_conv_unit1 = PreActResidualConvUnit(
|
| 670 |
+
in_channels=self.in_channels, act_layer=act_layer, norm_layer=norm_layer
|
| 671 |
+
)
|
| 672 |
+
self.res_conv_unit2 = PreActResidualConvUnit(
|
| 673 |
+
in_channels=self.in_channels, act_layer=act_layer, norm_layer=norm_layer
|
| 674 |
+
)
|
| 675 |
+
|
| 676 |
+
def forward(self, *inputs):
|
| 677 |
+
x = inputs[0]
|
| 678 |
+
if len(inputs) == 2:
|
| 679 |
+
if x.shape != inputs[1].shape:
|
| 680 |
+
res = resize(inputs[1], size=(x.shape[2], x.shape[3]), mode="bilinear", align_corners=False)
|
| 681 |
+
else:
|
| 682 |
+
res = inputs[1]
|
| 683 |
+
x = x + self.res_conv_unit1(res)
|
| 684 |
+
x = self.res_conv_unit2(x)
|
| 685 |
+
x = resize(x, scale_factor=2, mode="bilinear", align_corners=self.align_corners)
|
| 686 |
+
x = self.project(x)
|
| 687 |
+
return x
|
| 688 |
+
|
| 689 |
+
|
| 690 |
+
class DPTHead(DepthBaseDecodeHead):
|
| 691 |
+
"""Vision Transformers for Dense Prediction.
|
| 692 |
+
This head is an implementation of `DPT <https://arxiv.org/abs/2103.13413>`_.
|
| 693 |
+
Args:
|
| 694 |
+
embed_dims (int): The embed dimension of the ViT backbone.
|
| 695 |
+
Default: 768.
|
| 696 |
+
post_process_channels (List): Out channels of post process conv
|
| 697 |
+
layers. Default: [96, 192, 384, 768].
|
| 698 |
+
readout_type (str): Type of readout operation. Default: 'ignore'.
|
| 699 |
+
patch_size (int): The patch size. Default: 16.
|
| 700 |
+
expand_channels (bool): Whether expand the channels in post process
|
| 701 |
+
block. Default: False.
|
| 702 |
+
"""
|
| 703 |
+
|
| 704 |
+
def __init__(
|
| 705 |
+
self,
|
| 706 |
+
embed_dims=768,
|
| 707 |
+
post_process_channels=[96, 192, 384, 768],
|
| 708 |
+
readout_type="ignore",
|
| 709 |
+
patch_size=16,
|
| 710 |
+
expand_channels=False,
|
| 711 |
+
**kwargs,
|
| 712 |
+
):
|
| 713 |
+
super(DPTHead, self).__init__(**kwargs)
|
| 714 |
+
|
| 715 |
+
self.in_channels = self.in_channels
|
| 716 |
+
self.expand_channels = expand_channels
|
| 717 |
+
self.reassemble_blocks = ReassembleBlocks(embed_dims, post_process_channels, readout_type, patch_size)
|
| 718 |
+
|
| 719 |
+
self.post_process_channels = [
|
| 720 |
+
channel * math.pow(2, i) if expand_channels else channel for i, channel in enumerate(post_process_channels)
|
| 721 |
+
]
|
| 722 |
+
self.convs = nn.ModuleList()
|
| 723 |
+
for channel in self.post_process_channels:
|
| 724 |
+
self.convs.append(ConvModule(channel, self.channels, kernel_size=3, padding=1, act_layer=None, bias=False))
|
| 725 |
+
self.fusion_blocks = nn.ModuleList()
|
| 726 |
+
for _ in range(len(self.convs)):
|
| 727 |
+
self.fusion_blocks.append(FeatureFusionBlock(self.channels, self.act_layer, self.norm_layer))
|
| 728 |
+
self.fusion_blocks[0].res_conv_unit1 = None
|
| 729 |
+
self.project = ConvModule(self.channels, self.channels, kernel_size=3, padding=1, norm_layer=self.norm_layer)
|
| 730 |
+
self.num_fusion_blocks = len(self.fusion_blocks)
|
| 731 |
+
self.num_reassemble_blocks = len(self.reassemble_blocks.resize_layers)
|
| 732 |
+
self.num_post_process_channels = len(self.post_process_channels)
|
| 733 |
+
assert self.num_fusion_blocks == self.num_reassemble_blocks
|
| 734 |
+
assert self.num_reassemble_blocks == self.num_post_process_channels
|
| 735 |
+
self.conv_depth = HeadDepth(self.channels)
|
| 736 |
+
|
| 737 |
+
def forward(self, inputs, img_metas):
|
| 738 |
+
assert len(inputs) == self.num_reassemble_blocks
|
| 739 |
+
x = [inp for inp in inputs]
|
| 740 |
+
x = self.reassemble_blocks(x)
|
| 741 |
+
x = [self.convs[i](feature) for i, feature in enumerate(x)]
|
| 742 |
+
out = self.fusion_blocks[0](x[-1])
|
| 743 |
+
for i in range(1, len(self.fusion_blocks)):
|
| 744 |
+
out = self.fusion_blocks[i](out, x[-(i + 1)])
|
| 745 |
+
out = self.project(out)
|
| 746 |
+
out = self.depth_pred(out)
|
| 747 |
+
return out
|
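For reference, the classification branch of `DepthBaseDecodeHead.depth_pred` above converts per-pixel bin probabilities into metric depth via an einsum with the bin centers. A self-contained numeric sketch (toy shapes, "UD" bins, "linear" normalisation, eps = 0.1 as in the code above):

import torch

n_bins, min_depth, max_depth = 8, 1e-3, 10.0
logit = torch.randn(2, n_bins, 4, 4)  # (B, n_bins, H, W), as produced by conv_depth

# "UD" strategy: uniformly discretised depth bins between min_depth and max_depth.
bins = torch.linspace(min_depth, max_depth, n_bins)

# "linear" normalisation strategy: ReLU, add eps, normalise over the bin axis.
prob = torch.relu(logit) + 0.1
prob = prob / prob.sum(dim=1, keepdim=True)

# Expected depth per pixel = sum_k prob_k * bin_k, then restore the channel axis.
depth = torch.einsum("ikmn,k->imn", prob, bins).unsqueeze(dim=1)
print(depth.shape)  # torch.Size([2, 1, 4, 4])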
LHM/models/encoders/dinov2/hub/depth/encoder_decoder.py
ADDED
|
@@ -0,0 +1,351 @@
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
from collections import OrderedDict
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
|
| 12 |
+
from .ops import resize
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def add_prefix(inputs, prefix):
|
| 16 |
+
"""Add prefix for dict.
|
| 17 |
+
|
| 18 |
+
Args:
|
| 19 |
+
inputs (dict): The input dict with str keys.
|
| 20 |
+
prefix (str): The prefix to add.
|
| 21 |
+
|
| 22 |
+
Returns:
|
| 23 |
+
|
| 24 |
+
dict: The dict with keys updated with ``prefix``.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
outputs = dict()
|
| 28 |
+
for name, value in inputs.items():
|
| 29 |
+
outputs[f"{prefix}.{name}"] = value
|
| 30 |
+
|
| 31 |
+
return outputs
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class DepthEncoderDecoder(nn.Module):
|
| 35 |
+
"""Encoder Decoder depther.
|
| 36 |
+
|
| 37 |
+
EncoderDecoder typically consists of backbone and decode_head.
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
def __init__(self, backbone, decode_head):
|
| 41 |
+
super(DepthEncoderDecoder, self).__init__()
|
| 42 |
+
|
| 43 |
+
self.backbone = backbone
|
| 44 |
+
self.decode_head = decode_head
|
| 45 |
+
self.align_corners = self.decode_head.align_corners
|
| 46 |
+
|
| 47 |
+
def extract_feat(self, img):
|
| 48 |
+
"""Extract features from images."""
|
| 49 |
+
return self.backbone(img)
|
| 50 |
+
|
| 51 |
+
def encode_decode(self, img, img_metas, rescale=True, size=None):
|
| 52 |
+
"""Encode images with backbone and decode into a depth estimation
|
| 53 |
+
map of the same size as input."""
|
| 54 |
+
x = self.extract_feat(img)
|
| 55 |
+
out = self._decode_head_forward_test(x, img_metas)
|
| 56 |
+
# crop the pred depth to the certain range.
|
| 57 |
+
out = torch.clamp(out, min=self.decode_head.min_depth, max=self.decode_head.max_depth)
|
| 58 |
+
if rescale:
|
| 59 |
+
if size is None:
|
| 60 |
+
if img_metas is not None:
|
| 61 |
+
size = img_metas[0]["ori_shape"][:2]
|
| 62 |
+
else:
|
| 63 |
+
size = img.shape[2:]
|
| 64 |
+
out = resize(input=out, size=size, mode="bilinear", align_corners=self.align_corners)
|
| 65 |
+
return out
|
| 66 |
+
|
| 67 |
+
def _decode_head_forward_train(self, img, x, img_metas, depth_gt, **kwargs):
|
| 68 |
+
"""Run forward function and calculate loss for decode head in
|
| 69 |
+
training."""
|
| 70 |
+
losses = dict()
|
| 71 |
+
loss_decode = self.decode_head.forward_train(img, x, img_metas, depth_gt, **kwargs)
|
| 72 |
+
losses.update(add_prefix(loss_decode, "decode"))
|
| 73 |
+
return losses
|
| 74 |
+
|
| 75 |
+
def _decode_head_forward_test(self, x, img_metas):
|
| 76 |
+
"""Run forward function and calculate loss for decode head in
|
| 77 |
+
inference."""
|
| 78 |
+
depth_pred = self.decode_head.forward_test(x, img_metas)
|
| 79 |
+
return depth_pred
|
| 80 |
+
|
| 81 |
+
def forward_dummy(self, img):
|
| 82 |
+
"""Dummy forward function."""
|
| 83 |
+
depth = self.encode_decode(img, None)
|
| 84 |
+
|
| 85 |
+
return depth
|
| 86 |
+
|
| 87 |
+
def forward_train(self, img, img_metas, depth_gt, **kwargs):
|
| 88 |
+
"""Forward function for training.
|
| 89 |
+
|
| 90 |
+
Args:
|
| 91 |
+
img (Tensor): Input images.
|
| 92 |
+
img_metas (list[dict]): List of image info dict where each dict
|
| 93 |
+
has: 'img_shape', 'scale_factor', 'flip', and may also contain
|
| 94 |
+
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
|
| 95 |
+
For details on the values of these keys see
|
| 96 |
+
`depth/datasets/pipelines/formatting.py:Collect`.
|
| 97 |
+
depth_gt (Tensor): Depth gt
|
| 98 |
+
used if the architecture supports depth estimation task.
|
| 99 |
+
|
| 100 |
+
Returns:
|
| 101 |
+
dict[str, Tensor]: a dictionary of loss components
|
| 102 |
+
"""
|
| 103 |
+
|
| 104 |
+
x = self.extract_feat(img)
|
| 105 |
+
|
| 106 |
+
losses = dict()
|
| 107 |
+
|
| 108 |
+
# the last of x saves the info from neck
|
| 109 |
+
loss_decode = self._decode_head_forward_train(img, x, img_metas, depth_gt, **kwargs)
|
| 110 |
+
|
| 111 |
+
losses.update(loss_decode)
|
| 112 |
+
|
| 113 |
+
return losses
|
| 114 |
+
|
| 115 |
+
def whole_inference(self, img, img_meta, rescale, size=None):
|
| 116 |
+
"""Inference with full image."""
|
| 117 |
+
return self.encode_decode(img, img_meta, rescale, size=size)
|
| 118 |
+
|
| 119 |
+
def slide_inference(self, img, img_meta, rescale, stride, crop_size):
|
| 120 |
+
"""Inference by sliding-window with overlap.
|
| 121 |
+
|
| 122 |
+
If h_crop > h_img or w_crop > w_img, the small patch will be used to
|
| 123 |
+
decode without padding.
|
| 124 |
+
"""
|
| 125 |
+
|
| 126 |
+
h_stride, w_stride = stride
|
| 127 |
+
h_crop, w_crop = crop_size
|
| 128 |
+
batch_size, _, h_img, w_img = img.size()
|
| 129 |
+
h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
|
| 130 |
+
w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
|
| 131 |
+
preds = img.new_zeros((batch_size, 1, h_img, w_img))
|
| 132 |
+
count_mat = img.new_zeros((batch_size, 1, h_img, w_img))
|
| 133 |
+
for h_idx in range(h_grids):
|
| 134 |
+
for w_idx in range(w_grids):
|
| 135 |
+
y1 = h_idx * h_stride
|
| 136 |
+
x1 = w_idx * w_stride
|
| 137 |
+
y2 = min(y1 + h_crop, h_img)
|
| 138 |
+
x2 = min(x1 + w_crop, w_img)
|
| 139 |
+
y1 = max(y2 - h_crop, 0)
|
| 140 |
+
x1 = max(x2 - w_crop, 0)
|
| 141 |
+
crop_img = img[:, :, y1:y2, x1:x2]
|
| 142 |
+
depth_pred = self.encode_decode(crop_img, img_meta, rescale)
|
| 143 |
+
preds += F.pad(depth_pred, (int(x1), int(preds.shape[3] - x2), int(y1), int(preds.shape[2] - y2)))
|
| 144 |
+
|
| 145 |
+
count_mat[:, :, y1:y2, x1:x2] += 1
|
| 146 |
+
assert (count_mat == 0).sum() == 0
|
| 147 |
+
if torch.onnx.is_in_onnx_export():
|
| 148 |
+
# cast count_mat to constant while exporting to ONNX
|
| 149 |
+
count_mat = torch.from_numpy(count_mat.cpu().detach().numpy()).to(device=img.device)
|
| 150 |
+
preds = preds / count_mat
|
| 151 |
+
return preds
|
| 152 |
+
|
| 153 |
+
def inference(self, img, img_meta, rescale, size=None, mode="whole"):
|
| 154 |
+
"""Inference with slide/whole style.
|
| 155 |
+
|
| 156 |
+
Args:
|
| 157 |
+
img (Tensor): The input image of shape (N, 3, H, W).
|
| 158 |
+
img_meta (dict): Image info dict where each dict has: 'img_shape',
|
| 159 |
+
'scale_factor', 'flip', and may also contain
|
| 160 |
+
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
|
| 161 |
+
For details on the values of these keys see
|
| 162 |
+
`depth/datasets/pipelines/formatting.py:Collect`.
|
| 163 |
+
rescale (bool): Whether rescale back to original shape.
|
| 164 |
+
|
| 165 |
+
Returns:
|
| 166 |
+
Tensor: The output depth map.
|
| 167 |
+
"""
|
| 168 |
+
|
| 169 |
+
assert mode in ["slide", "whole"]
|
| 170 |
+
ori_shape = img_meta[0]["ori_shape"]
|
| 171 |
+
assert all(_["ori_shape"] == ori_shape for _ in img_meta)
|
| 172 |
+
if mode == "slide":
|
| 173 |
+
depth_pred = self.slide_inference(img, img_meta, rescale)
|
| 174 |
+
else:
|
| 175 |
+
depth_pred = self.whole_inference(img, img_meta, rescale, size=size)
|
| 176 |
+
output = depth_pred
|
| 177 |
+
flip = img_meta[0]["flip"]
|
| 178 |
+
if flip:
|
| 179 |
+
flip_direction = img_meta[0]["flip_direction"]
|
| 180 |
+
assert flip_direction in ["horizontal", "vertical"]
|
| 181 |
+
if flip_direction == "horizontal":
|
| 182 |
+
output = output.flip(dims=(3,))
|
| 183 |
+
elif flip_direction == "vertical":
|
| 184 |
+
output = output.flip(dims=(2,))
|
| 185 |
+
|
| 186 |
+
return output
|
| 187 |
+
|
| 188 |
+
def simple_test(self, img, img_meta, rescale=True):
|
| 189 |
+
"""Simple test with single image."""
|
| 190 |
+
depth_pred = self.inference(img, img_meta, rescale)
|
| 191 |
+
if torch.onnx.is_in_onnx_export():
|
| 192 |
+
# our inference backend only support 4D output
|
| 193 |
+
depth_pred = depth_pred.unsqueeze(0)
|
| 194 |
+
return depth_pred
|
| 195 |
+
depth_pred = depth_pred.cpu().numpy()
|
| 196 |
+
# unravel batch dim
|
| 197 |
+
depth_pred = list(depth_pred)
|
| 198 |
+
return depth_pred
|
| 199 |
+
|
| 200 |
+
def aug_test(self, imgs, img_metas, rescale=True):
|
| 201 |
+
"""Test with augmentations.
|
| 202 |
+
|
| 203 |
+
Only rescale=True is supported.
|
| 204 |
+
"""
|
| 205 |
+
# aug_test rescale all imgs back to ori_shape for now
|
| 206 |
+
assert rescale
|
| 207 |
+
# to save memory, we get augmented depth logit inplace
|
| 208 |
+
depth_pred = self.inference(imgs[0], img_metas[0], rescale)
|
| 209 |
+
for i in range(1, len(imgs)):
|
| 210 |
+
cur_depth_pred = self.inference(imgs[i], img_metas[i], rescale, size=depth_pred.shape[-2:])
|
| 211 |
+
depth_pred += cur_depth_pred
|
| 212 |
+
depth_pred /= len(imgs)
|
| 213 |
+
depth_pred = depth_pred.cpu().numpy()
|
| 214 |
+
# unravel batch dim
|
| 215 |
+
depth_pred = list(depth_pred)
|
| 216 |
+
return depth_pred
|
| 217 |
+
|
| 218 |
+
def forward_test(self, imgs, img_metas, **kwargs):
|
| 219 |
+
"""
|
| 220 |
+
Args:
|
| 221 |
+
imgs (List[Tensor]): the outer list indicates test-time
|
| 222 |
+
augmentations and inner Tensor should have a shape NxCxHxW,
|
| 223 |
+
which contains all images in the batch.
|
| 224 |
+
img_metas (List[List[dict]]): the outer list indicates test-time
|
| 225 |
+
augs (multiscale, flip, etc.) and the inner list indicates
|
| 226 |
+
images in a batch.
|
| 227 |
+
"""
|
| 228 |
+
for var, name in [(imgs, "imgs"), (img_metas, "img_metas")]:
|
| 229 |
+
if not isinstance(var, list):
|
| 230 |
+
raise TypeError(f"{name} must be a list, but got " f"{type(var)}")
|
| 231 |
+
num_augs = len(imgs)
|
| 232 |
+
if num_augs != len(img_metas):
|
| 233 |
+
raise ValueError(f"num of augmentations ({len(imgs)}) != " f"num of image meta ({len(img_metas)})")
|
| 234 |
+
# all images in the same aug batch all of the same ori_shape and pad
|
| 235 |
+
# shape
|
| 236 |
+
for img_meta in img_metas:
|
| 237 |
+
ori_shapes = [_["ori_shape"] for _ in img_meta]
|
| 238 |
+
assert all(shape == ori_shapes[0] for shape in ori_shapes)
|
| 239 |
+
img_shapes = [_["img_shape"] for _ in img_meta]
|
| 240 |
+
assert all(shape == img_shapes[0] for shape in img_shapes)
|
| 241 |
+
pad_shapes = [_["pad_shape"] for _ in img_meta]
|
| 242 |
+
assert all(shape == pad_shapes[0] for shape in pad_shapes)
|
| 243 |
+
|
| 244 |
+
if num_augs == 1:
|
| 245 |
+
return self.simple_test(imgs[0], img_metas[0], **kwargs)
|
| 246 |
+
else:
|
| 247 |
+
return self.aug_test(imgs, img_metas, **kwargs)
|
| 248 |
+
|
| 249 |
+
def forward(self, img, img_metas, return_loss=True, **kwargs):
|
| 250 |
+
"""Calls either :func:`forward_train` or :func:`forward_test` depending
|
| 251 |
+
on whether ``return_loss`` is ``True``.
|
| 252 |
+
|
| 253 |
+
Note this setting will change the expected inputs. When
|
| 254 |
+
``return_loss=True``, img and img_meta are single-nested (i.e. Tensor
|
| 255 |
+
and List[dict]), and when ``resturn_loss=False``, img and img_meta
|
| 256 |
+
should be double nested (i.e. List[Tensor], List[List[dict]]), with
|
| 257 |
+
the outer list indicating test time augmentations.
|
| 258 |
+
"""
|
| 259 |
+
if return_loss:
|
| 260 |
+
return self.forward_train(img, img_metas, **kwargs)
|
| 261 |
+
else:
|
| 262 |
+
return self.forward_test(img, img_metas, **kwargs)
|
| 263 |
+
|
| 264 |
+
def train_step(self, data_batch, optimizer, **kwargs):
|
| 265 |
+
"""The iteration step during training.
|
| 266 |
+
|
| 267 |
+
This method defines an iteration step during training, except for the
|
| 268 |
+
back propagation and optimizer updating, which are done in an optimizer
|
| 269 |
+
hook. Note that in some complicated cases or models, the whole process
|
| 270 |
+
including back propagation and optimizer updating is also defined in
|
| 271 |
+
this method, such as GAN.
|
| 272 |
+
|
| 273 |
+
Args:
|
| 274 |
+
data (dict): The output of dataloader.
|
| 275 |
+
optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
|
| 276 |
+
runner is passed to ``train_step()``. This argument is unused
|
| 277 |
+
and reserved.
|
| 278 |
+
|
| 279 |
+
Returns:
|
| 280 |
+
dict: It should contain at least 3 keys: ``loss``, ``log_vars``,
|
| 281 |
+
``num_samples``.
|
| 282 |
+
``loss`` is a tensor for back propagation, which can be a
|
| 283 |
+
weighted sum of multiple losses.
|
| 284 |
+
``log_vars`` contains all the variables to be sent to the
|
| 285 |
+
logger.
|
| 286 |
+
``num_samples`` indicates the batch size (when the model is
|
| 287 |
+
DDP, it means the batch size on each GPU), which is used for
|
| 288 |
+
averaging the logs.
|
| 289 |
+
"""
|
| 290 |
+
losses = self(**data_batch)
|
| 291 |
+
|
| 292 |
+
# split losses and images
|
| 293 |
+
real_losses = {}
|
| 294 |
+
log_imgs = {}
|
| 295 |
+
for k, v in losses.items():
|
| 296 |
+
if "img" in k:
|
| 297 |
+
log_imgs[k] = v
|
| 298 |
+
else:
|
| 299 |
+
real_losses[k] = v
|
| 300 |
+
|
| 301 |
+
loss, log_vars = self._parse_losses(real_losses)
|
| 302 |
+
|
| 303 |
+
outputs = dict(loss=loss, log_vars=log_vars, num_samples=len(data_batch["img_metas"]), log_imgs=log_imgs)
|
| 304 |
+
|
| 305 |
+
return outputs
|
| 306 |
+
|
| 307 |
+
def val_step(self, data_batch, **kwargs):
|
| 308 |
+
"""The iteration step during validation.
|
| 309 |
+
|
| 310 |
+
This method shares the same signature as :func:`train_step`, but used
|
| 311 |
+
during val epochs. Note that the evaluation after training epochs is
|
| 312 |
+
not implemented with this method, but an evaluation hook.
|
| 313 |
+
"""
|
| 314 |
+
output = self(**data_batch, **kwargs)
|
| 315 |
+
return output
|
| 316 |
+
|
| 317 |
+
@staticmethod
|
| 318 |
+
def _parse_losses(losses):
|
| 319 |
+
import torch.distributed as dist
|
| 320 |
+
|
| 321 |
+
"""Parse the raw outputs (losses) of the network.
|
| 322 |
+
|
| 323 |
+
Args:
|
| 324 |
+
losses (dict): Raw output of the network, which usually contain
|
| 325 |
+
losses and other necessary information.
|
| 326 |
+
|
| 327 |
+
Returns:
|
| 328 |
+
tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor
|
| 329 |
+
which may be a weighted sum of all losses, log_vars contains
|
| 330 |
+
all the variables to be sent to the logger.
|
| 331 |
+
"""
|
| 332 |
+
log_vars = OrderedDict()
|
| 333 |
+
for loss_name, loss_value in losses.items():
|
| 334 |
+
if isinstance(loss_value, torch.Tensor):
|
| 335 |
+
log_vars[loss_name] = loss_value.mean()
|
| 336 |
+
elif isinstance(loss_value, list):
|
| 337 |
+
log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
|
| 338 |
+
else:
|
| 339 |
+
raise TypeError(f"{loss_name} is not a tensor or list of tensors")
|
| 340 |
+
|
| 341 |
+
loss = sum(_value for _key, _value in log_vars.items() if "loss" in _key)
|
| 342 |
+
|
| 343 |
+
log_vars["loss"] = loss
|
| 344 |
+
for loss_name, loss_value in log_vars.items():
|
| 345 |
+
# reduce loss when distributed training
|
| 346 |
+
if dist.is_available() and dist.is_initialized():
|
| 347 |
+
loss_value = loss_value.data.clone()
|
| 348 |
+
dist.all_reduce(loss_value.div_(dist.get_world_size()))
|
| 349 |
+
log_vars[loss_name] = loss_value.item()
|
| 350 |
+
|
| 351 |
+
return loss, log_vars
|
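The training contract above hinges on two conventions: ``train_step`` routes every output whose key contains "img" to image logging and everything else to loss parsing, and ``_parse_losses`` sums every ``log_vars`` entry whose key contains "loss" into the scalar used for backpropagation. The standalone sketch below (not part of the diff) mirrors that aggregation with a hypothetical loss dict; the distributed all_reduce step is omitted for brevity.

from collections import OrderedDict

import torch


def parse_losses_sketch(losses):
    # Mean-reduce each entry (summing over list entries), then sum every key
    # containing "loss" into the scalar used for backpropagation.
    log_vars = OrderedDict()
    for name, value in losses.items():
        log_vars[name] = value.mean() if isinstance(value, torch.Tensor) else sum(v.mean() for v in value)
    loss = sum(v for k, v in log_vars.items() if "loss" in k)
    log_vars["loss"] = loss
    return loss, {k: v.item() for k, v in log_vars.items()}


# Hypothetical raw outputs of the network.
raw = {
    "loss_depth": torch.tensor([0.8, 1.2]),                 # mean 1.0, summed into "loss"
    "loss_smooth": [torch.tensor(0.1), torch.tensor(0.3)],  # list -> 0.1 + 0.3 = 0.4
    "abs_rel": torch.tensor(0.05),                          # no "loss" in the key: logged only
}
loss, log_vars = parse_losses_sketch(raw)
print(loss)      # tensor(1.4000) = 1.0 + 0.4
print(log_vars)  # {'loss_depth': 1.0, 'loss_smooth': ~0.4, 'abs_rel': 0.05, 'loss': ~1.4}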
LHM/models/encoders/dinov2/hub/depth/ops.py
ADDED
@@ -0,0 +1,28 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

import warnings

import torch.nn.functional as F


def resize(input, size=None, scale_factor=None, mode="nearest", align_corners=None, warning=False):
    # Thin wrapper around F.interpolate that can warn when align_corners=True
    # is used with output/input sizes that do not satisfy the `nx+1` relation.
    if warning:
        if size is not None and align_corners:
            input_h, input_w = tuple(int(x) for x in input.shape[2:])
            output_h, output_w = tuple(int(x) for x in size)
            if output_h > input_h or output_w > input_w:
                if (
                    (output_h > 1 and output_w > 1 and input_h > 1 and input_w > 1)
                    and (output_h - 1) % (input_h - 1)
                    and (output_w - 1) % (input_w - 1)
                ):
                    warnings.warn(
                        f"When align_corners={align_corners}, "
                        "the output would be more aligned if "
                        f"input size {(input_h, input_w)} is `x+1` and "
                        f"out size {(output_h, output_w)} is `nx+1`"
                    )
    return F.interpolate(input, size, scale_factor, mode, align_corners)
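As a quick illustration of the check above (not part of the diff; shapes chosen arbitrarily): with align_corners=True, upsampling is only exactly "aligned" when each output side length equals n*(input - 1) + 1, which is what the modulo test encodes before the wrapper falls through to F.interpolate.

import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 33, 33)  # (N, C, H, W)

# 33 -> 65 satisfies (65 - 1) % (33 - 1) == 0, so resize(..., warning=True) stays silent.
y = F.interpolate(x, size=(65, 65), mode="bilinear", align_corners=True)

# 33 -> 64 does not, so the wrapper would emit the alignment warning before interpolating.
z = F.interpolate(x, size=(64, 64), mode="bilinear", align_corners=True)

print(y.shape, z.shape)  # torch.Size([1, 3, 65, 65]) torch.Size([1, 3, 64, 64])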
LHM/models/encoders/dinov2/hub/depthers.py
ADDED
@@ -0,0 +1,246 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

from enum import Enum
from functools import partial
from typing import Optional, Tuple, Union

import torch

from .backbones import _make_dinov2_model
from .depth import BNHead, DepthEncoderDecoder, DPTHead
from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name, CenterPadding


class Weights(Enum):
    NYU = "NYU"
    KITTI = "KITTI"


def _get_depth_range(pretrained: bool, weights: Weights = Weights.NYU) -> Tuple[float, float]:
    if not pretrained:  # Default
        return (0.001, 10.0)

    # Pretrained, set according to the training dataset for the provided weights
    if weights == Weights.KITTI:
        return (0.001, 80.0)

    if weights == Weights.NYU:
        return (0.001, 10.0)

    return (0.001, 10.0)


def _make_dinov2_linear_depth_head(
    *,
    embed_dim: int,
    layers: int,
    min_depth: float,
    max_depth: float,
    **kwargs,
):
    if layers not in (1, 4):
        raise AssertionError(f"Unsupported number of layers: {layers}")

    if layers == 1:
        in_index = [0]
    else:
        assert layers == 4
        in_index = [0, 1, 2, 3]

    return BNHead(
        classify=True,
        n_bins=256,
        bins_strategy="UD",
        norm_strategy="linear",
        upsample=4,
        in_channels=[embed_dim] * len(in_index),
        in_index=in_index,
        input_transform="resize_concat",
        channels=embed_dim * len(in_index) * 2,
        align_corners=False,
        # NB: the head's depth range is hardcoded here; the min_depth/max_depth
        # arguments are accepted for API symmetry but not forwarded.
        min_depth=0.001,
        max_depth=80,
        loss_decode=(),
    )


def _make_dinov2_linear_depther(
    *,
    arch_name: str = "vit_large",
    layers: int = 4,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.NYU,
    depth_range: Optional[Tuple[float, float]] = None,
    **kwargs,
):
    if layers not in (1, 4):
        raise AssertionError(f"Unsupported number of layers: {layers}")
    if isinstance(weights, str):
        try:
            weights = Weights[weights]
        except KeyError:
            raise AssertionError(f"Unsupported weights: {weights}")

    if depth_range is None:
        depth_range = _get_depth_range(pretrained, weights)
    min_depth, max_depth = depth_range

    backbone = _make_dinov2_model(arch_name=arch_name, pretrained=pretrained, **kwargs)

    embed_dim = backbone.embed_dim
    patch_size = backbone.patch_size
    model_name = _make_dinov2_model_name(arch_name, patch_size)
    linear_depth_head = _make_dinov2_linear_depth_head(
        embed_dim=embed_dim,
        layers=layers,
        min_depth=min_depth,
        max_depth=max_depth,
    )

    layer_count = {
        "vit_small": 12,
        "vit_base": 12,
        "vit_large": 24,
        "vit_giant2": 40,
    }[arch_name]

    if layers == 4:
        out_index = {
            "vit_small": [2, 5, 8, 11],
            "vit_base": [2, 5, 8, 11],
            "vit_large": [4, 11, 17, 23],
            "vit_giant2": [9, 19, 29, 39],
        }[arch_name]
    else:
        assert layers == 1
        out_index = [layer_count - 1]

    model = DepthEncoderDecoder(backbone=backbone, decode_head=linear_depth_head)
    model.backbone.forward = partial(
        backbone.get_intermediate_layers,
        n=out_index,
        reshape=True,
        return_class_token=True,
        norm=False,
    )
    model.backbone.register_forward_pre_hook(lambda _, x: CenterPadding(patch_size)(x[0]))

    if pretrained:
        layers_str = str(layers) if layers == 4 else ""
        weights_str = weights.value.lower()
        url = _DINOV2_BASE_URL + f"/{model_name}/{model_name}_{weights_str}_linear{layers_str}_head.pth"
        checkpoint = torch.hub.load_state_dict_from_url(url, map_location="cpu")
        state_dict = checkpoint.get("state_dict", checkpoint)
        model.load_state_dict(state_dict, strict=False)

    return model


def dinov2_vits14_ld(*, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs):
    return _make_dinov2_linear_depther(
        arch_name="vit_small", layers=layers, pretrained=pretrained, weights=weights, **kwargs
    )


def dinov2_vitb14_ld(*, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs):
    return _make_dinov2_linear_depther(
        arch_name="vit_base", layers=layers, pretrained=pretrained, weights=weights, **kwargs
    )


def dinov2_vitl14_ld(*, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs):
    return _make_dinov2_linear_depther(
        arch_name="vit_large", layers=layers, pretrained=pretrained, weights=weights, **kwargs
    )


def dinov2_vitg14_ld(*, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs):
    return _make_dinov2_linear_depther(
        arch_name="vit_giant2", layers=layers, ffn_layer="swiglufused", pretrained=pretrained, weights=weights, **kwargs
    )


def _make_dinov2_dpt_depth_head(*, embed_dim: int, min_depth: float, max_depth: float):
    return DPTHead(
        in_channels=[embed_dim] * 4,
        channels=256,
        embed_dims=embed_dim,
        post_process_channels=[embed_dim // 2 ** (3 - i) for i in range(4)],
        readout_type="project",
        min_depth=min_depth,
        max_depth=max_depth,
        loss_decode=(),
    )


def _make_dinov2_dpt_depther(
    *,
    arch_name: str = "vit_large",
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.NYU,
    depth_range: Optional[Tuple[float, float]] = None,
    **kwargs,
):
    if isinstance(weights, str):
        try:
            weights = Weights[weights]
        except KeyError:
            raise AssertionError(f"Unsupported weights: {weights}")

    if depth_range is None:
        depth_range = _get_depth_range(pretrained, weights)
    min_depth, max_depth = depth_range

    backbone = _make_dinov2_model(arch_name=arch_name, pretrained=pretrained, **kwargs)

    model_name = _make_dinov2_model_name(arch_name, backbone.patch_size)
    dpt_depth_head = _make_dinov2_dpt_depth_head(embed_dim=backbone.embed_dim, min_depth=min_depth, max_depth=max_depth)

    out_index = {
        "vit_small": [2, 5, 8, 11],
        "vit_base": [2, 5, 8, 11],
        "vit_large": [4, 11, 17, 23],
        "vit_giant2": [9, 19, 29, 39],
    }[arch_name]

    model = DepthEncoderDecoder(backbone=backbone, decode_head=dpt_depth_head)
    model.backbone.forward = partial(
        backbone.get_intermediate_layers,
        n=out_index,
        reshape=True,
        return_class_token=True,
        norm=False,
    )
    model.backbone.register_forward_pre_hook(lambda _, x: CenterPadding(backbone.patch_size)(x[0]))

    if pretrained:
        weights_str = weights.value.lower()
        url = _DINOV2_BASE_URL + f"/{model_name}/{model_name}_{weights_str}_dpt_head.pth"
        checkpoint = torch.hub.load_state_dict_from_url(url, map_location="cpu")
        state_dict = checkpoint.get("state_dict", checkpoint)
        model.load_state_dict(state_dict, strict=False)

    return model


def dinov2_vits14_dd(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs):
    return _make_dinov2_dpt_depther(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitb14_dd(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs):
    return _make_dinov2_dpt_depther(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitl14_dd(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs):
    return _make_dinov2_dpt_depther(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitg14_dd(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.NYU, **kwargs):
    return _make_dinov2_dpt_depther(
        arch_name="vit_giant2", ffn_layer="swiglufused", pretrained=pretrained, weights=weights, **kwargs
    )
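A minimal usage sketch for these entry points, assuming the module is importable from the LHM package path shown in this diff; the import path and the whole_inference call follow the upstream dinov2 depth-estimation example and are assumptions here, not part of the upload.

import torch

from LHM.models.encoders.dinov2.hub.depthers import dinov2_vits14_ld

# Build the ViT-S/14 linear depther without downloading weights; pretrained=True
# would also fetch the NYU linear head checkpoint from _DINOV2_BASE_URL.
depther = dinov2_vits14_ld(layers=4, pretrained=False)
depther.eval()

# CenterPadding pads inputs to a multiple of the 14-pixel patch size.
x = torch.randn(1, 3, 518, 518)
with torch.no_grad():
    depth = depther.whole_inference(x, img_meta=None, rescale=True)
print(depth.shape)  # expected: torch.Size([1, 1, 518, 518]), one metric-depth map per image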