| """ | |
| Helpers for distributed training. | |
| """ | |
| import os | |
| import socket | |
| import torch as th | |
| import torch.distributed as dist | |
| from torch.distributed import barrier, is_initialized, broadcast | |
| # Change this to reflect your cluster layout. | |
| # The GPU for a given rank is (rank % GPUS_PER_NODE). | |
| GPUS_PER_NODE = 8 | |
| SETUP_RETRY_COUNT = 3 | |
| import datetime | |
| import os | |
| import socket | |
| from contextlib import closing | |


def find_free_port() -> int:
    """Ask the OS for an unused TCP port and return its number."""
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        # SO_REUSEADDR must be set before bind() to have any effect.
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(("", 0))
        return s.getsockname()[1]


def check_if_port_open(port: int) -> bool:
    """Return True if ``port`` can currently be bound on this host."""
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        try:
            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            s.bind(("", port))
            return True
        except OSError:
            return False


def initialized():
    return dist.is_initialized()


def finalize():
    if dist.is_initialized():
        dist.destroy_process_group()


def initialize():
    # torchrun / torch.distributed.launch export RANK, WORLD_SIZE, MASTER_ADDR
    # and MASTER_PORT; if they are missing, assume we were launched via mpirun.
    is_mpirun = not (
        "RANK" in os.environ
        and "WORLD_SIZE" in os.environ
        and "MASTER_ADDR" in os.environ
        and "MASTER_PORT" in os.environ
    )
    if is_mpirun:
        from mpi4py import MPI

        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        world_size = comm.Get_size()

        master_addr = None
        master_port = None
        if rank == 0:
            result = subprocess.check_output("hostname -I", shell=True)
            master_addr = result.decode("utf-8").split()[0]
            base_port = os.environ.get(
                "MASTER_PORT", "29500"
            )  # TORCH_DISTRIBUTED_DEFAULT_PORT
            if check_if_port_open(int(base_port)):
                master_port = base_port
            else:
                master_port = find_free_port()
        master_addr = comm.bcast(master_addr, root=0)
        master_port = comm.bcast(master_port, root=0)

        # Determine the local rank, assuming hostnames are unique per node.
        proc_name = MPI.Get_processor_name()
        all_procs = comm.allgather(proc_name)
        local_rank = sum(p == proc_name for p in all_procs[:rank])
        uniq_proc_names = set(all_procs)
        host_rank = sorted(uniq_proc_names).index(proc_name)

        os.environ["LOCAL_RANK"] = str(local_rank)
        os.environ["HOST_RANK"] = str(host_rank)
        os.environ["NUM_HOSTS"] = str(len(uniq_proc_names))
        os.environ["RANK"] = str(rank)
        os.environ["WORLD_SIZE"] = str(world_size)
        os.environ["MASTER_ADDR"] = master_addr
        os.environ["MASTER_PORT"] = str(master_port)
        os.environ["OMP_NUM_THREADS"] = "1"

    # Initialize torch.distributed: NCCL on GPU machines, Gloo otherwise.
    backend = "gloo" if not th.cuda.is_available() else "nccl"
    dist.init_process_group(backend=backend, timeout=datetime.timedelta(0, 3600))
    if th.cuda.is_available():
        th.cuda.set_device(int(os.environ.get("LOCAL_RANK", "0")))

    if is_mpirun and dist.get_rank() == 0:
        print("Distributed setup")
        print("LOCAL_RANK", os.environ["LOCAL_RANK"])
        print("HOST_RANK", os.environ["HOST_RANK"])
        print("NUM_HOSTS", os.environ["NUM_HOSTS"])
        print("WORLD_SIZE", os.environ["WORLD_SIZE"])


def local_host_gather(data):
    """Gather ``data`` from every rank running on the same host as this rank."""
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    host_rank = os.environ["HOST_RANK"]
    all_data = comm.allgather((host_rank, data))
    return [d[1] for d in all_data if d[0] == host_rank]


def in_distributed_mode():
    # ``dist`` is always importable here, so ``dist is not None`` would always
    # be True; check for an initialized process group instead.
    return dist.is_available() and dist.is_initialized()


def is_master():
    return get_rank() == 0


def is_local_master():
    return get_local_rank() == 0


def get_rank():
    return dist.get_rank() if in_distributed_mode() else 0


def get_local_rank():
    # Default to 0 for single-process runs, matching initialize().
    return int(os.environ.get("LOCAL_RANK", "0"))


def worker_host_idx():
    return int(os.environ["HOST_RANK"])


def num_hosts():
    return int(os.environ["NUM_HOSTS"])


def get_world_size():
    return dist.get_world_size() if in_distributed_mode() else 1


def gpu_visible_device_list():
    return str(dist.get_rank()) if in_distributed_mode() else None


def get_device():
    """
    Get the device to use for torch.distributed.
    """
    if th.cuda.is_available():
        return th.device("cuda")
    return th.device("cpu")


def sync_params(params):
    """
    Synchronize a sequence of Tensors across ranks from rank 0.
    """
    for p in params:
        with th.no_grad():
            dist.broadcast(p, 0)
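
# Typical use (illustrative): broadcast rank 0's freshly constructed weights so
# every rank starts from identical parameters, e.g.
#   model = MyModel().to(get_device())   # MyModel is a placeholder
#   sync_params(model.parameters())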


def print0(*args, **kwargs):
    if get_rank() == 0:
        print(*args, **kwargs)


def allreduce(t: th.Tensor, async_op=False):
    if dist.is_initialized():
        if th.cuda.is_available() and not t.is_cuda:
            # NCCL only reduces CUDA tensors, so round-trip through the GPU
            # and copy the result back into the original CPU tensor.
            cu = t.detach().cuda()
            ret = dist.all_reduce(cu, async_op=async_op)
            t.copy_(cu.cpu())
        else:
            ret = dist.all_reduce(t, async_op=async_op)
        return ret
    return None


def allgather(t: th.Tensor, cat=True):
    if dist.is_initialized():
        if th.cuda.is_available() and not t.is_cuda:
            # NCCL only gathers CUDA tensors.
            t = t.cuda()
        ls = [th.empty_like(t) for _ in range(get_world_size())]
        dist.all_gather(ls, t)
    else:
        ls = [t]
    if cat:
        ls = th.cat(ls, dim=0)
    return ls
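

# ---------------------------------------------------------------------------
# Illustrative smoke test: running this file directly initializes the process
# group, broadcasts rank 0's tensor, and all-reduces/gathers per-rank values.
# The file name `dist_util.py` in the example launch command is an assumption:
#   torchrun --nproc_per_node=2 dist_util.py
if __name__ == "__main__":
    initialize()
    device = get_device()

    # Every rank builds a different tensor; sync_params overwrites all of them
    # with rank 0's values.
    params = [th.full((2,), float(get_rank()), device=device)]
    sync_params(params)
    print0("after sync_params:", params[0].tolist())  # expect [0.0, 0.0]

    # Sum each rank's id across the world and report the mean on rank 0.
    x = th.tensor([float(get_rank())], device=device)
    allreduce(x)
    print0("mean rank id:", (x / get_world_size()).tolist())

    # Gather one element per rank into a single tensor of length world_size.
    y = th.tensor([float(get_rank())], device=device)
    print0("gathered ranks:", allgather(y).tolist())

    finalize()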