import os
import numpy as np
import torch
import gradio
import functools
from reloc3r.utils.image import load_images, check_images_shape_format
from reloc3r.reloc3r_relpose import setup_reloc3r_relpose_model, inference_relpose
from reloc3r.utils.device import to_numpy
import cv2
import trimesh
import PIL
from scipy.spatial.transform import Rotation

# from dust3r: OpenGL camera convention (x right, y up, z backward),
# i.e. flip the y and z axes relative to the OpenCV convention
OPENGL = np.array([[1, 0, 0, 0],
                   [0, -1, 0, 0],
                   [0, 0, -1, 0],
                   [0, 0, 0, 1]])

# func from dust3r
def geotrf(Trf, pts, ncol=None, norm=False):
    """ Apply a geometric transformation to a list of 3-D points.

    Trf: 3x3 or 4x4 projection matrix (typically a homography)
    pts: numpy/torch/tuple of coordinates. Shape must be (...,2) or (...,3)
    ncol: int. number of columns of the result (2 or 3)
    norm: float. if != 0, the result is projected on the z=norm plane.

    Returns the transformed points, reshaped to match the leading shape of `pts`.
    """
    assert Trf.ndim >= 2
    if isinstance(Trf, np.ndarray):
        pts = np.asarray(pts)
    elif isinstance(Trf, torch.Tensor):
        pts = torch.as_tensor(pts, dtype=Trf.dtype)

    # adapt shape if necessary
    output_reshape = pts.shape[:-1]
    ncol = ncol or pts.shape[-1]

    # optimized code
    if (isinstance(Trf, torch.Tensor) and isinstance(pts, torch.Tensor) and
            Trf.ndim == 3 and pts.ndim == 4):
        d = pts.shape[3]
        if Trf.shape[-1] == d:
            pts = torch.einsum("bij, bhwj -> bhwi", Trf, pts)
        elif Trf.shape[-1] == d + 1:
            pts = torch.einsum("bij, bhwj -> bhwi", Trf[:, :d, :d], pts) + Trf[:, None, None, :d, d]
        else:
            raise ValueError(f'bad shape, not ending with 3 or 4, for {Trf.shape=}')
    else:
        if Trf.ndim >= 3:
            n = Trf.ndim - 2
            assert Trf.shape[:n] == pts.shape[:n], 'batch size does not match'
            Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1])

            if pts.ndim > Trf.ndim:
                # Trf == (B,d,d) & pts == (B,H,W,d) --> (B, H*W, d)
                pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1])
            elif pts.ndim == 2:
                # Trf == (B,d,d) & pts == (B,d) --> (B, 1, d)
                pts = pts[:, None, :]

        if pts.shape[-1] + 1 == Trf.shape[-1]:
            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
            pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :]
        elif pts.shape[-1] == Trf.shape[-1]:
            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
            pts = pts @ Trf
        else:
            pts = Trf @ pts.T
            if pts.ndim >= 2:
                pts = pts.swapaxes(-1, -2)

    if norm:
        pts = pts / pts[..., -1:]  # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG
        if norm != 1:
            pts *= norm

    res = pts[..., :ncol].reshape(*output_reshape, ncol)
    return res
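
# Usage sketch (illustrative, not part of the demo): a batch of rigid 4x4
# transforms applied to per-pixel 3-D points takes the einsum fast path above;
# the tensor names here are assumptions for the example only.
#   Trf = torch.eye(4).expand(2, 4, 4)   # (B, 4, 4) poses
#   pts = torch.rand(2, 32, 32, 3)       # (B, H, W, 3) points
#   out = geotrf(Trf, pts)               # -> (B, 32, 32, 3)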

# func from dust3r
def add_scene_cam(scene, pose_c2w, edge_color, image=None, focal=None, imsize=None, screen_width=0.11, marker=None):
    if image is not None:
        image = np.asarray(image)
        H, W, THREE = image.shape
        assert THREE == 3
        if image.dtype != np.uint8:
            image = np.uint8(255 * image)
    elif imsize is not None:
        W, H = imsize
    elif focal is not None:
        H = W = focal / 1.1
    else:
        H = W = 1

    if isinstance(focal, np.ndarray):
        focal = focal[0]
    if not focal:
        focal = min(H, W) * 1.1  # default value
    # create fake camera
    height = max(screen_width / 10, focal * screen_width / H)
    width = screen_width * 0.5**0.5
    rot45 = np.eye(4)
    rot45[:3, :3] = Rotation.from_euler('z', np.deg2rad(45)).as_matrix()
    rot45[2, 3] = -height  # set the tip of the cone = optical center
    aspect_ratio = np.eye(4)
    aspect_ratio[0, 0] = W / H
    transform = pose_c2w @ OPENGL @ aspect_ratio @ rot45
    cam = trimesh.creation.cone(width, height, sections=4)  # , transform=transform)
    # this is the image
    if image is not None:
        vertices = geotrf(transform, cam.vertices[[4, 5, 1, 3]])
        faces = np.array([[0, 1, 2], [0, 2, 3], [2, 1, 0], [3, 2, 0]])
        img = trimesh.Trimesh(vertices=vertices, faces=faces)
        uv_coords = np.float32([[0, 0], [1, 0], [1, 1], [0, 1]])
        img.visual = trimesh.visual.TextureVisuals(uv_coords, image=PIL.Image.fromarray(image))
        scene.add_geometry(img)

    # this is the camera mesh
    rot2 = np.eye(4)
    rot2[:3, :3] = Rotation.from_euler('z', np.deg2rad(2)).as_matrix()
    vertices = np.r_[cam.vertices, 0.95 * cam.vertices, geotrf(rot2, cam.vertices)]
    vertices = geotrf(transform, vertices)
    faces = []
    for face in cam.faces:
        if 0 in face:
            continue
        a, b, c = face
        a2, b2, c2 = face + len(cam.vertices)
        a3, b3, c3 = face + 2 * len(cam.vertices)

        # add 3 pseudo-edges
        faces.append((a, b, b2))
        faces.append((a, a2, c))
        faces.append((c2, b, c))

        faces.append((a, b, b3))
        faces.append((a, a3, c))
        faces.append((c3, b, c))

    # no culling
    faces += [(c, b, a) for a, b, c in faces]

    cam = trimesh.Trimesh(vertices=vertices, faces=faces)
    cam.visual.face_colors[:, :3] = edge_color
    scene.add_geometry(cam)

    if marker == 'o':
        marker = trimesh.creation.icosphere(3, radius=screen_width / 4)
        marker.vertices += pose_c2w[:3, 3]
        marker.visual.face_colors[:, :3] = edge_color
        scene.add_geometry(marker)
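
# Usage sketch (illustrative): drop a red frustum at the origin of a fresh
# scene; the identity pose and focal value here are assumptions for the
# example, not demo defaults.
#   scene = trimesh.Scene()
#   add_scene_cam(scene, np.eye(4), edge_color=(255, 0, 0), focal=400.0)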

# save relpose to a .glb file
def vis_pose2to1(pose2to1, images):
    poses = [np.identity(4), pose2to1]
    colors = [(255, 0, 0), (0, 0, 255)]
    scene = trimesh.Scene()

    # add each camera
    for i, pose_c2w in enumerate(poses):
        camera_edge_color = colors[i]
        add_scene_cam(scene, pose_c2w, camera_edge_color, images[i])

    # coord transform for vis
    rot = np.eye(4)
    rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
    scene.apply_transform(np.linalg.inv(poses[0] @ OPENGL @ rot))

    path = '_tmp_vis/scene.glb'
    scene.export(file_obj=path)
    print('Scene saved to', path)
    return path
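
# Note on conventions (as used above): camera 1 sits at the identity and
# pose2to1 is passed as the camera-to-world pose of camera 2 in camera 1's
# frame, so a point X2 in camera-2 coordinates maps to camera 1 via
#   X1 = pose2to1[:3, :3] @ X2 + pose2to1[:3, 3]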

# draw matches in images
def vis_ca_match(img1, img2, ca2to1_path, topkq=5, topkk=4):
    img1 = np.ascontiguousarray(img1)
    img2 = np.ascontiguousarray(img2)
    attn_map = np.loadtxt(ca2to1_path)

    h1, w1, _ = img1.shape
    h2, w2, _ = img2.shape
    assert w1 == w2
    hp1, wp1 = h1 // 16, w1 // 16  # 16x16-pixel patch grid
    hp2, wp2 = h2 // 16, w2 // 16

    # stack the two images vertically, slightly whitened, with colored borders
    vis = np.concatenate((img1, img2), axis=0)
    alpha = 0.5
    overlay = vis.copy()
    overlay[:, :] = (255, 255, 255)
    cv2.addWeighted(overlay, alpha, vis, 1 - alpha, 0, vis)
    cv2.rectangle(vis, (1, 1), (w1 - 2, h1 - 2), color=(255, 0, 0, 255), thickness=1, lineType=cv2.LINE_AA)
    cv2.rectangle(vis, (1, h1 + 1), (w2 - 2, h1 + h2 - 2), color=(0, 0, 255, 255), thickness=1, lineType=cv2.LINE_AA)
    colors = [(245, 67, 62, 255), (93, 141, 253, 255), (94, 128, 64, 255), (245, 168, 61, 255), (0, 0, 0, 255)]

    def find_top_k_indices(arr, k):
        sorted_indices = np.argsort(arr)
        top_k_indices = sorted_indices[-k:]
        return top_k_indices
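
    # e.g. find_top_k_indices(np.array([0.1, 0.9, 0.4]), k=2) -> array([2, 1]);
    # indices come back in ascending score order, with the argmax last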

    # select the topkq queries with the strongest responses; zero out queries
    # whose top keys touch the image border
    response_list = []
    for id_v2 in range(attn_map.shape[0]):
        out_of_bound = False
        topk_list = find_top_k_indices(attn_map[id_v2], k=topkk)
        response = attn_map[id_v2][topk_list].mean()
        response_list.append(response)
        for id_v1 in topk_list:
            y_v1 = (id_v1 // wp1) * 16
            x_v1 = (id_v1 % wp1) * 16
            if x_v1 == 0 or x_v1 == w1 - 16:
                out_of_bound = True
            if y_v1 == 0 or y_v1 == h1 - 16:
                out_of_bound = True
        if out_of_bound:
            response_list[-1] = 0
            continue
    top_match_ids = np.argsort(response_list)[-topkq:]

    # draw the responses as matches
    for i in range(len(top_match_ids)):
        id_v2 = top_match_ids[i]
        color = colors[i] if i < len(colors) else colors[-1]

        # query
        y_v2 = (id_v2 // wp2) * 16
        x_v2 = (id_v2 % wp2) * 16
        cv2.rectangle(vis, (x_v2, y_v2 + h1), (x_v2 + 15, y_v2 + h1 + 15), color=color, thickness=1, lineType=cv2.LINE_AA)

        # keys
        topk_list = find_top_k_indices(attn_map[id_v2], k=topkk)
        for id_v1 in topk_list:
            y_v1 = (id_v1 // wp1) * 16
            x_v1 = (id_v1 % wp1) * 16
            cv2.rectangle(vis, (x_v1, y_v1), (x_v1 + 15, y_v1 + 15), color=color, thickness=1, lineType=cv2.LINE_AA)

        # lines
        for id_v1 in topk_list:
            y_v1 = (id_v1 // wp1) * 16
            x_v1 = (id_v1 % wp1) * 16
            cv2.line(vis, (x_v1 + 7, y_v1 + 7), (x_v2 + 7, y_v2 + h1 + 7), color=color, thickness=1, lineType=cv2.LINE_AA)

    return vis
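
# Usage sketch (illustrative): render matches from a saved cross-attention
# map; the file path mirrors the one built in run_reloc3r_rpr below.
#   vis = vis_ca_match(img1, img2, '_tmp_vis/_ca_block5_head0.txt')
#   cv2.imwrite('_tmp_vis/match.png', vis[:, :, ::-1])  # RGB -> BGR for OpenCV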

# run the whole process
def run_reloc3r_rpr(reloc3r_relpose, img_reso, device, imgs):
    if imgs is None or len(imgs) < 2:
        raise gradio.Error('Please upload a pair of images.')
    if len(imgs) > 2:
        print('There are >2 images uploaded, running with the first 2 images...')

    # load images
    print('Loading images...')
    images = load_images(imgs[0:2], size=int(img_reso))
    images = check_images_shape_format(images, device)
    img1 = ((images[0]['img'].detach().cpu().numpy().squeeze().transpose(1, 2, 0) + 1) / 2 * 255).astype(np.uint8)
    img2 = ((images[1]['img'].detach().cpu().numpy().squeeze().transpose(1, 2, 0) + 1) / 2 * 255).astype(np.uint8)

    # estimate relpose
    print('Running relative pose estimation...')
    batch = [images[0], images[1]]
    pose2to1 = to_numpy(inference_relpose(batch, reloc3r_relpose, device)[0])
    pose2to1[0:3, 3] = pose2to1[0:3, 3] / np.linalg.norm(pose2to1[0:3, 3])  # normalize the scale to 1 meter
    pose_vis = vis_pose2to1(pose2to1, [img1, img2])
    path = '_tmp_vis/pose2to1.txt'
    np.savetxt(path, pose2to1)
    print('Pose saved to', path)

    # patch matches from cross-attn
    print('Visualizing patch matches...')
    block_id = 5
    head_id = 0
    match_vis = vis_ca_match(img1, img2, '_tmp_vis/_ca_block{}_head{}.txt'.format(block_id, head_id))
    path = '_tmp_vis/match.png'
    cv2.imwrite(path, match_vis[:, :, [2, 1, 0]])  # RGB -> BGR
    print('Match visualization saved to', path)

    return pose_vis, [match_vis]

# gradio interface
def main_demo(reloc3r_relpose, device, img_reso, server_name, server_port):
    run = functools.partial(run_reloc3r_rpr, reloc3r_relpose, img_reso, device)

    with gradio.Blocks(css=""".gradio-container {margin: 0 !important; min-width: 100%}""",
                       title="Reloc3r relative camera pose demo") as demo:
        gradio.HTML('<h2 style="text-align: center;">Reloc3r relative camera pose demo</h2>')

        # components
        with gradio.Row():
            with gradio.Column():
                inputfiles = gradio.File(file_count="multiple", file_types=["image"],
                                         scale=2,
                                         height=200,
                                         label="Upload a pair of images")
                run_btn = gradio.Button("Run")
            outmodel = gradio.Model3D(camera_position=(-90, 45, 2),
                                      height=400,
                                      label="Camera poses")
            outgallery = gradio.Gallery(preview=True,
                                        height=400,
                                        label="Cross-attention responses (top-5 queries with top-4 keys)")

        # events
        run_btn.click(fn=run,
                      inputs=[inputfiles],
                      outputs=[outmodel, outgallery])

    # demo.launch(share=False, server_name=server_name, server_port=server_port)
    demo.launch()

if __name__ == '__main__':
    print('Note: this demo runs slowly because it operates on CPU and saves intermediate data.')

    os.environ["GRADIO_TEMP_DIR"] = '_tmp_gradio'
    os.makedirs('_tmp_gradio', exist_ok=True)
    os.makedirs('_tmp_vis', exist_ok=True)

    server_name = '127.0.0.1'
    server_port = 7867
    img_reso = '512'
    device = torch.device('cpu')

    print('Loading Reloc3r-512 RPR model...')
    reloc3r_relpose = setup_reloc3r_relpose_model(model_args=img_reso, device=device)

    main_demo(reloc3r_relpose, device, img_reso, server_name, server_port)