import logging
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass, replace
from math import cos, pi, sqrt
from typing import Any, Dict, List, Optional, Tuple

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel
from torch.optim.optimizer import Optimizer as OptimizerBase

from .model import LayerNormBase, BitLinear158
from .config import OptimizerType, SchedulerConfig, SchedulerType, TrainConfig
from .torch_util import get_default_device, is_distributed

__all__ = [
    "Optimizer",
    "LionW",
    "AdamW",
    "Scheduler",
    "CosWithWarmup",
    "LinearWithWarmup",
    "InvSqrtWithWarmup",
    "MaxScheduler",
    "ConstantScheduler",
    "BoltOnWarmupScheduler",
    "build_optimizer",
    "build_scheduler",
]


log = logging.getLogger(__name__)


class Optimizer(OptimizerBase):
    def _clean_param_name(self, name: str) -> str:
        return name.replace("_fsdp_wrapped_module.", "")

    @torch.no_grad()
    def clip_grads_and_collect_metrics(
        self, global_step: int, collect_param_metrics: bool = True
    ) -> Dict[str, torch.Tensor]:
        """
        Clips gradients for every group that has the field `max_grad_norm`.
        At the same time, collects metrics for each parameter and its gradient.
        """
        device = get_default_device()
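        # This proceeds in three stages: (1) collect min/max/sum/norm metrics for every
        # parameter and gradient on this rank, (2) reduce those metrics across ranks when
        # running distributed (sharded) training, and (3) clip gradients group by group,
        # using adaptive clipping when ``max_grad_norm_ratio`` is set and fixed global
        # clipping when only ``max_grad_norm`` is set.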
					
						
        per_param_min_metrics: List[torch.Tensor] = []
        per_param_max_metrics: List[torch.Tensor] = []
        per_param_sum_metrics: List[torch.Tensor] = []
        per_param_norm_metrics: List[torch.Tensor] = []
        per_param_numel_metrics: List[torch.Tensor] = []

        per_param_min_metric_names: List[str] = []
        per_param_max_metric_names: List[str] = []
        per_param_avg_metric_names: List[str] = []
        per_param_norm_metric_names: List[str] = []

        for group in self.param_groups:
            if is_distributed():
                assert group.get("sharded", True) is True

            for name, p in zip(group["param_names"], group["params"]):
                name = self._clean_param_name(name)
                tensors: List[Optional[torch.Tensor]] = [p.grad]
                prefixes: List[str] = [f"grad/{name}"]
                if collect_param_metrics:
                    state = self.get_state_for_param(p)
                    sorted_state_keys = sorted([k for k in state.keys()])
                    tensors.extend([p] + [state[key] for key in sorted_state_keys])
                    prefixes.extend([f"param/{name}"] + [f"{key}/{name}" for key in sorted_state_keys])
                assert len(tensors) == len(prefixes)

                for x, prefix in zip(tensors, prefixes):
                    if x is not None and x.numel() > 0:
                        if collect_param_metrics:
                            x_abs = x.abs()
                            per_param_min_metrics.append(x_abs.min().unsqueeze(0).to(dtype=torch.float32))
                            per_param_max_metrics.append(x_abs.max().unsqueeze(0).to(dtype=torch.float32))
                            per_param_sum_metrics.append(x.sum().unsqueeze(0).to(dtype=torch.float32))
                            per_param_numel_metrics.append(
                                torch.tensor([x.numel()], device=device, dtype=torch.float32)
                            )
                        per_param_norm_metrics.append(
                            torch.linalg.vector_norm(x, 2.0, dtype=torch.float32).unsqueeze(0)
                        )
                    else:
                        if collect_param_metrics:
                            per_param_min_metrics.append(
                                torch.tensor([float("inf")], device=device, dtype=torch.float32)
                            )
                            per_param_max_metrics.append(torch.tensor([0.0], device=device, dtype=torch.float32))
                            per_param_sum_metrics.append(torch.tensor([0.0], device=device, dtype=torch.float32))
                            per_param_numel_metrics.append(torch.tensor([0.0], device=device, dtype=torch.float32))
                        per_param_norm_metrics.append(torch.tensor([0.0], device=device, dtype=torch.float32))
                    if collect_param_metrics:
                        per_param_min_metric_names.append(f"{prefix}.min")
                        per_param_max_metric_names.append(f"{prefix}.max")
                        per_param_avg_metric_names.append(f"{prefix}.avg")
                    per_param_norm_metric_names.append(f"{prefix}.norm")

        assert (
            len(per_param_min_metrics)
            == len(per_param_min_metric_names)
            == len(per_param_max_metrics)
            == len(per_param_max_metric_names)
            == len(per_param_sum_metrics)
            == len(per_param_numel_metrics)
            == len(per_param_avg_metric_names)
        )
        assert len(per_param_norm_metrics) == len(per_param_norm_metric_names)

        def is_grad_norm_metric(metric_name: str) -> bool:
            return metric_name.startswith("grad/") and metric_name.endswith(".norm")
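        # Under distributed (sharded) training each rank only sees its local shard of a
        # parameter, so the local L2 norms collected above are combined across ranks as a
        # sum of squares followed by a square root to recover the full norms.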
					
						
        total_grad_norm: torch.Tensor
        per_param_avg_metrics: List[torch.Tensor] = []
        if is_distributed():
            if per_param_min_metrics:
                all_mins = torch.cat(per_param_min_metrics).to(device)
                dist.reduce(all_mins, 0, op=dist.ReduceOp.MIN)
                per_param_min_metrics = all_mins.split(1)
            if per_param_max_metrics:
                all_maxs = torch.cat(per_param_max_metrics).to(device)
                dist.reduce(all_maxs, 0, op=dist.ReduceOp.MAX)
                per_param_max_metrics = all_maxs.split(1)
            all_norms = torch.cat(per_param_norm_metrics).to(device) ** 2.0
            if per_param_sum_metrics and per_param_numel_metrics:
                all_sums = torch.cat(per_param_sum_metrics).to(device)
                all_numels = torch.cat(per_param_numel_metrics).to(device)
                all_sums_norms_numels = torch.cat(
                    [all_sums.unsqueeze(0), all_norms.unsqueeze(0), all_numels.unsqueeze(0)], dim=0
                )
                dist.all_reduce(all_sums_norms_numels, op=dist.ReduceOp.SUM)
                all_sums, all_norms, all_numels = all_sums_norms_numels.split(1)
                per_param_avg_metrics = (all_sums / all_numels).squeeze(0).split(1)
            else:
                dist.all_reduce(all_norms, op=dist.ReduceOp.SUM)
            grad_norm_metric_mask = torch.tensor(
                [float(is_grad_norm_metric(n)) for n in per_param_norm_metric_names], device=all_norms.device
            )
            total_grad_norm = (all_norms * grad_norm_metric_mask).sum() ** 0.5
            per_param_norm_metrics = (all_norms ** (0.5)).squeeze(0).split(1)
        else:
            total_grad_norm = (
                torch.cat(
                    [
                        m
                        for m, n in zip(per_param_norm_metrics, per_param_norm_metric_names)
                        if is_grad_norm_metric(n)
                    ]
                )
                ** 2.0
            ).sum() ** 0.5
            per_param_avg_metrics = [x / n for x, n in zip(per_param_sum_metrics, per_param_numel_metrics)]

        assert len(per_param_avg_metrics) == len(per_param_avg_metric_names)

        all_metrics: Dict[str, torch.Tensor] = {}
        for metric_name, metric in zip(per_param_min_metric_names, per_param_min_metrics):
            all_metrics[metric_name] = metric.squeeze(0)
        for metric_name, metric in zip(per_param_max_metric_names, per_param_max_metrics):
            all_metrics[metric_name] = metric.squeeze(0)
        for metric_name, metric in zip(per_param_avg_metric_names, per_param_avg_metrics):
            all_metrics[metric_name] = metric.squeeze(0)
        for metric_name, metric in zip(per_param_norm_metric_names, per_param_norm_metrics):
            all_metrics[metric_name] = metric.squeeze(0)
        all_metrics["total_grad_norm"] = total_grad_norm
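        # Note that "total_grad_norm" aggregates only the "grad/*.norm" entries (not the
        # parameter or optimizer-state norms); it is the value used for fixed global
        # clipping below.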
					
						
        num_grads_clipped = 0
        num_eligible_grads = 0
        for group in self.param_groups:
            if (max_norm_ratio := group.get("max_grad_norm_ratio")) is not None:
                num_clipped = self._do_adaptive_clipping(
                    group, max_norm_ratio, global_step, all_metrics, collect_param_metrics=collect_param_metrics
                )
            elif (max_norm := group.get("max_grad_norm")) is not None:
                num_clipped = self._do_global_fixed_clipping(
                    group, max_norm, all_metrics, collect_param_metrics=collect_param_metrics
                )
            else:
                continue
            num_eligible_grads += len(group["params"])
            if num_clipped is not None:
                num_grads_clipped += num_clipped

        if collect_param_metrics:
            if num_eligible_grads > 0:
                clipping_rate = torch.tensor(num_grads_clipped / num_eligible_grads, device="cpu")
            else:
                clipping_rate = torch.tensor(0.0, device="cpu")
            all_metrics["clipping_rate"] = clipping_rate
            return all_metrics
        else:
            return {}

    @torch.no_grad()
    def _do_adaptive_clipping(
        self,
        group: Dict[str, Any],
        max_norm_ratio: float,
        global_step: int,
        all_metrics: Dict[str, torch.Tensor],
        collect_param_metrics: bool = True,
    ) -> Optional[int]:
        """
        Do adaptive gradient clipping on a param group.

        If ``collect_param_metrics`` is ``True`` this will return the total number of gradients clipped.
        """
        device = get_default_device()
        num_grads_clipped = 0
        beta1, beta2 = group["betas"]
        beta = max(beta1, beta2)
        for name, p in zip(group["param_names"], group["params"]):
            name = self._clean_param_name(name)
            grad_norm = all_metrics.get(f"grad/{name}.norm")
            if grad_norm is None:
                continue

            state = self.state[p]
            grad_norm_exp_avg = state.get("grad_norm_exp_avg")
            if grad_norm_exp_avg is None:
                grad_norm_exp_avg = grad_norm.clone().to(device)
                if global_step > 1:
                    state["grad_norm_exp_avg"] = grad_norm_exp_avg

            max_allowed_norm = max_norm_ratio * grad_norm_exp_avg
            clip_coef = max_allowed_norm / (grad_norm + 1e-6)
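            # Adaptive clipping: each gradient is allowed a norm of at most
            # ``max_norm_ratio`` times a running exponential average of its own clipped
            # norm (the 1e-6 term guards against division by zero). The coefficient is
            # clamped to 1.0 below, so gradients are only ever scaled down, never up.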
					
						
            clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
            if p.grad is not None:
                p.grad.detach().mul_(clip_coef_clamped.to(p.grad.device, p.grad.dtype))

            grad_norm_exp_avg.lerp_((grad_norm * clip_coef_clamped).to(grad_norm_exp_avg.device), 1 - beta)

            if collect_param_metrics:
                if clip_coef_clamped < 1.0:
                    num_grads_clipped += 1
                all_metrics[f"grad_norm_exp_avg/{name}"] = grad_norm_exp_avg
        return num_grads_clipped if collect_param_metrics else None

    @torch.no_grad()
    def _do_global_fixed_clipping(
        self,
        group: Dict[str, Any],
        max_norm: float,
        all_metrics: Dict[str, torch.Tensor],
        collect_param_metrics: bool = True,
    ) -> Optional[int]:
        """
        Do global fixed gradient clipping on a param group.

        If ``collect_param_metrics`` is ``True`` this will return the total number of gradients clipped.
        """
        device = get_default_device()
        total_grad_norm = all_metrics["total_grad_norm"]
        clip_coef = max_norm / (total_grad_norm.to(device) + 1e-6)
        clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
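        # Standard global-norm clipping: every gradient in this group is scaled by the
        # same factor so the total gradient norm does not exceed ``max_norm``.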
					
						
        num_grads_clipped: Optional[int] = None
        if collect_param_metrics:
            if clip_coef_clamped < 1.0:
                num_grads_clipped = len(group["params"])
        for p in group["params"]:
            if p.grad is not None:
                p.grad.detach().mul_(clip_coef_clamped.to(p.grad.device, p.grad.dtype))
        return num_grads_clipped

    def get_post_step_metrics(self, module: nn.Module) -> Dict[str, torch.Tensor]:
        del module
        return {}

    def get_state_for_param(self, param: nn.Parameter) -> Dict[str, Optional[torch.Tensor]]:
        del param
        return {}


class LionW(Optimizer):
    """
    Adapted from https://github.com/google/automl/blob/master/lion/lion_pytorch.py
    """

    def __init__(
        self,
        params,
        lr: float = 1e-4,
        betas: Tuple[float, float] = (0.9, 0.99),
        weight_decay: float = 0.0,
    ):
        assert lr > 0.0
        assert all([0.0 <= beta <= 1.0 for beta in betas])
        defaults = dict(lr=lr, betas=betas, weight_decay=weight_decay)
        super().__init__(params, defaults)
        for group in self.param_groups:
            group["initial_lr"] = group["lr"]
        self._update_total_dot_prod: Optional[torch.Tensor] = None
        self._update_total_norm: Optional[torch.Tensor] = None
        self._signed_update_total_norm: Optional[torch.Tensor] = None

    def get_post_step_metrics(self, module: nn.Module) -> Dict[str, torch.Tensor]:
        update_total_dot_prod = self._update_total_dot_prod
        update_total_norm = self._update_total_norm
        signed_update_total_norm = self._signed_update_total_norm
        if update_total_dot_prod is None or update_total_norm is None or signed_update_total_norm is None:
            return {}

        if is_distributed() and isinstance(module, FullyShardedDataParallel):
            update_total_norm = update_total_norm**2.0
            signed_update_total_norm = signed_update_total_norm**2.0
            all_together = torch.stack([update_total_dot_prod, update_total_norm, signed_update_total_norm])
            dist.reduce(all_together, 0)
            update_total_dot_prod, update_total_norm, signed_update_total_norm = all_together
            update_total_norm = update_total_norm**0.5
            signed_update_total_norm = signed_update_total_norm**0.5

        update_cos_sim = update_total_dot_prod / torch.max(
            update_total_norm * signed_update_total_norm, torch.tensor(1e-8, device=get_default_device())
        )
        return {"update_cos_sim": update_cos_sim}

    @torch.no_grad()
    def step(self, closure=None) -> None:
        if closure is not None:
            with torch.enable_grad():
                closure()

        update_total_dot_prod = torch.tensor(0.0, dtype=torch.float32)
        update_norms = []
        signed_update_norms = []

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue

                p.data.mul_(1 - group["lr"] * group["weight_decay"])

                grad = p.grad
                state = self.state[p]

                if len(state) == 0:
                    state["exp_avg"] = torch.zeros_like(p)

                exp_avg = state["exp_avg"]
                beta1, beta2 = group["betas"]

                update = exp_avg * beta1 + grad * (1 - beta1)
                signed_update = torch.sign(update)
                p.add_(signed_update, alpha=-group["lr"])
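                # Lion update: decoupled weight decay was applied above, and the step is
                # taken in the direction of sign(beta1 * m + (1 - beta1) * g). The momentum
                # buffer itself is updated with the slower beta2 below.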
					
						
                exp_avg.mul_(beta2).add_(grad, alpha=1 - beta2)

                update_total_dot_prod = update_total_dot_prod.to(update.device)
                update_total_dot_prod += torch.tensordot(update, signed_update, dims=len(update.shape))
                update_norms.append(torch.linalg.vector_norm(update, 2.0, dtype=torch.float32))
                signed_update_norms.append(torch.linalg.vector_norm(signed_update, 2.0, dtype=torch.float32))

        self._update_total_dot_prod = update_total_dot_prod.to(get_default_device())
        self._update_total_norm = torch.linalg.vector_norm(
            torch.stack(update_norms),
            2.0,
            dtype=torch.float32,
        ).to(get_default_device())
        self._signed_update_total_norm = torch.linalg.vector_norm(
            torch.stack(signed_update_norms),
            2.0,
            dtype=torch.float32,
        ).to(get_default_device())


class AdamW(torch.optim.AdamW, Optimizer):
    def get_state_for_param(self, param: nn.Parameter) -> Dict[str, Optional[torch.Tensor]]:
        return {key: self.state[param].get(key) for key in ("exp_avg", "exp_avg_sq")}


@dataclass
class Scheduler(metaclass=ABCMeta):
    grad_clip_warmup_steps: Optional[int]
    grad_clip_warmup_factor: Optional[float]

    @abstractmethod
    def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
        raise NotImplementedError

    def _get_max_grad_norm_coeff(
        self, initial_value: Optional[float], step: int, max_steps: int
    ) -> Optional[float]:
        del max_steps
        if initial_value is None:
            return None
        elif (
            self.grad_clip_warmup_steps is None
            or self.grad_clip_warmup_factor is None
            or step > self.grad_clip_warmup_steps
        ):
            return initial_value
        else:
            return self.grad_clip_warmup_factor * initial_value

    def get_max_grad_norm(
        self, initial_max_grad_norm: Optional[float], step: int, max_steps: int
    ) -> Optional[float]:
        return self._get_max_grad_norm_coeff(initial_max_grad_norm, step, max_steps)

    def get_max_grad_norm_ratio(
        self, initial_max_grad_norm_ratio: Optional[float], step: int, max_steps: int
    ) -> Optional[float]:
        return self._get_max_grad_norm_coeff(initial_max_grad_norm_ratio, step, max_steps)

    def _linear_warmup(self, initial_lr: float, step: int, warmup_steps: int = 2000) -> float:
        return initial_lr * (0.1 + 0.9 * min(step, warmup_steps) / warmup_steps)
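    # The warmup above starts at 10% of the target learning rate and ramps linearly up to
    # 100% at ``warmup_steps``, presumably so the LR never starts at exactly zero.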
					
						
@dataclass
class CosWithWarmup(Scheduler):
    warmup_steps: int
    alpha_f: float = 0.1
    t_max: Optional[int] = None

    def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
        max_steps = max_steps if self.t_max is None else self.t_max
        eta_min = initial_lr * self.alpha_f
        if step < self.warmup_steps:
            return self._linear_warmup(initial_lr, step, self.warmup_steps)
        elif step >= max_steps:
            return eta_min
        else:
            step = step - self.warmup_steps
            max_steps = max_steps - self.warmup_steps
            return eta_min + (initial_lr - eta_min) * (1 + cos(pi * step / max_steps)) / 2
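        # After warmup the LR follows a cosine from ``initial_lr`` down to
        # ``alpha_f * initial_lr`` at ``max_steps`` (or ``t_max`` if given) and stays
        # there afterwards.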
					
						
@dataclass
class LinearWithWarmup(Scheduler):
    warmup_steps: int
    alpha_f: float = 0.1
    t_max: Optional[int] = None

    def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
        max_steps = max_steps if self.t_max is None else self.t_max
        eta_min = initial_lr * self.alpha_f
        if step < self.warmup_steps:
            return self._linear_warmup(initial_lr, step, self.warmup_steps)
        elif step >= max_steps:
            return eta_min
        else:
            step = step - self.warmup_steps
            max_steps = max_steps - self.warmup_steps
            return initial_lr - (initial_lr - eta_min) * (step / max_steps)


@dataclass
class InvSqrtWithWarmup(Scheduler):
    warmup_steps: int

    def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
        if step < self.warmup_steps:
            return self._linear_warmup(initial_lr, step, self.warmup_steps)
        del max_steps
        return initial_lr * sqrt(self.warmup_steps / max(self.warmup_steps, step))


@dataclass
class MaxScheduler(Scheduler):
    sched1: Scheduler
    sched2: Scheduler

    def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
        return max(
            self.sched1.get_lr(initial_lr, step, max_steps), self.sched2.get_lr(initial_lr, step, max_steps)
        )


@dataclass
class BoltOnWarmupScheduler(Scheduler):
    inner: Scheduler
    warmup_start: int
    warmup_end: int

    @classmethod
    def wrap(cls, scheduler: Scheduler, warmup_start: int, warmup_end: int) -> "BoltOnWarmupScheduler":
        return cls(
            grad_clip_warmup_steps=None,
            grad_clip_warmup_factor=None,
            inner=scheduler,
            warmup_start=warmup_start,
            warmup_end=warmup_end,
        )

    def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
        if step < self.warmup_start:
            return 0.0
        if step < self.warmup_end:
            lr_at_intercept = self.inner.get_lr(initial_lr, self.warmup_end, max_steps)
            return lr_at_intercept * (step - self.warmup_start) / (self.warmup_end - self.warmup_start)
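            # Between ``warmup_start`` and ``warmup_end`` the LR ramps linearly from zero
            # up to whatever the wrapped scheduler would produce at ``warmup_end``; after
            # that the wrapped scheduler takes over unchanged.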
					
						
        else:
            return self.inner.get_lr(initial_lr, step, max_steps)

    def _get_max_grad_norm_coeff(
        self, initial_value: Optional[float], step: int, max_steps: int
    ) -> Optional[float]:
        return self.inner._get_max_grad_norm_coeff(initial_value, step, max_steps)


@dataclass
class ConstantScheduler(Scheduler):
    def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
        del step, max_steps
        return initial_lr


PARAM_GROUP_FIELDS = ("sharded", "max_grad_norm", "max_grad_norm_ratio", "param_names")


def get_param_groups(cfg: TrainConfig, model: nn.Module) -> List[Dict[str, Any]]:
    """
    Separate parameters into weight-decay and non-weight-decay groups.
    """
    param_groups: List[Dict[str, Any]]
    param_group_defaults = {
        "sharded": isinstance(model, FullyShardedDataParallel),
        "max_grad_norm": cfg.max_grad_norm,
        "max_grad_norm_ratio": cfg.max_grad_norm_ratio,
    }

    decay = set()
    no_decay = set()
    all_params = {}
    for mn, m in model.named_modules():
        for pn, p in m.named_parameters():
            if not p.requires_grad:
                continue

            fpn = f"{mn}.{pn}" if mn else pn
            all_params[fpn] = p

            if pn.endswith("bias"):
                if cfg.optimizer.decay_norm_and_bias:
                    decay.add(fpn)
                else:
                    no_decay.add(fpn)
            elif pn.endswith("weight") and (isinstance(m, nn.Linear) or isinstance(m, BitLinear158)):
                decay.add(fpn)
            elif pn.endswith("weight") and isinstance(m, (LayerNormBase, nn.LayerNorm)):
                if cfg.optimizer.decay_norm_and_bias:
                    decay.add(fpn)
                else:
                    no_decay.add(fpn)
            elif pn.endswith("weight") and isinstance(m, nn.Embedding):
                if cfg.optimizer.decay_embeddings:
                    decay.add(fpn)
                else:
                    no_decay.add(fpn)
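            # In short: Linear (and BitLinear158) weights always get weight decay, while
            # biases, norm weights, and embedding weights decay only if the config says so.
            # Anything left unmatched here is caught by the assertions below.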
					
						
    inter_params = decay & no_decay
    union_params = decay | no_decay
    assert len(inter_params) == 0, f"parameters {inter_params} made it into both decay/no_decay sets!"
    assert (
        len(all_params.keys() - union_params) == 0
    ), f"parameters {all_params.keys() - union_params} were not separated into either decay/no_decay set!"

    decay_sorted = sorted(list(decay))
    no_decay_sorted = sorted(list(no_decay))
    param_groups = []
    if len(decay_sorted) > 0:
        param_groups.append(
            {
                "params": [all_params[pn] for pn in decay_sorted],
                "param_names": decay_sorted,
                **param_group_defaults,
            }
        )
    if len(no_decay_sorted) > 0:
        param_groups.append(
            {
                "params": [all_params[pn] for pn in no_decay_sorted],
                "param_names": no_decay_sorted,
                "weight_decay": 0.0,
                **param_group_defaults,
            }
        )

    for group in param_groups:
        for key in PARAM_GROUP_FIELDS:
            assert key in group

    return param_groups


def fix_optim_state_dict(optimizer: Optimizer, state_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Make sure old optim state dicts are compatible with new versions.
    """
    if len(state_dict["param_groups"]) == 1 and len(optimizer.param_groups) == 2:
        assert optimizer.param_groups[1]["weight_decay"] == 0.0

        decay_param_group = {k: v for k, v in state_dict["param_groups"][0].items() if k != "params"}
        decay_param_group["params"] = optimizer.state_dict()["param_groups"][0]["params"]

        no_decay_param_group = {k: v for k, v in state_dict["param_groups"][0].items() if k != "params"}
        no_decay_param_group["weight_decay"] = 0.0
        no_decay_param_group["params"] = optimizer.state_dict()["param_groups"][1]["params"]
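        # Older checkpoints stored a single param group; it is split here into the current
        # decay / no-decay layout by copying the old group's hyperparameters and pointing
        # each copy at the parameter ids the current optimizer expects.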
					
						
        state_dict["param_groups"] = [decay_param_group, no_decay_param_group]

    assert len(optimizer.param_groups) == len(state_dict["param_groups"])

    for group, sd_group in zip(optimizer.param_groups, state_dict["param_groups"]):
        for key in PARAM_GROUP_FIELDS:
            sd_group[key] = group[key]

    return state_dict


def build_optimizer(cfg: TrainConfig, model: nn.Module) -> Optimizer:
    param_groups = get_param_groups(cfg, model)
    log.info(f"Constructing optimizer with {len(param_groups)} param groups")
    if cfg.optimizer.name == OptimizerType.lionw:
        return LionW(
            param_groups,
            lr=cfg.optimizer.learning_rate,
            betas=cfg.optimizer.betas,
            weight_decay=cfg.optimizer.weight_decay,
        )
    elif cfg.optimizer.name == OptimizerType.adamw:
        return AdamW(
            param_groups,
            lr=cfg.optimizer.learning_rate,
            betas=cfg.optimizer.betas,
            weight_decay=cfg.optimizer.weight_decay,
            eps=1e-5,
        )
    else:
        raise NotImplementedError


def build_scheduler(cfg: TrainConfig, sched_cfg: Optional[SchedulerConfig] = None) -> Scheduler:
    sched_cfg = sched_cfg if sched_cfg is not None else cfg.scheduler
    if sched_cfg.name == SchedulerType.cosine_with_warmup:
        return CosWithWarmup(
            grad_clip_warmup_steps=None
            if sched_cfg.grad_clip_warmup_steps is None
            else int(sched_cfg.grad_clip_warmup_steps),
            grad_clip_warmup_factor=sched_cfg.grad_clip_warmup_factor,
            warmup_steps=int(sched_cfg.t_warmup),
            alpha_f=sched_cfg.alpha_f,
            t_max=None if sched_cfg.t_max is None else int(sched_cfg.t_max),
        )
    elif sched_cfg.name == SchedulerType.linear_with_warmup:
        return LinearWithWarmup(
            grad_clip_warmup_steps=None
            if sched_cfg.grad_clip_warmup_steps is None
            else int(sched_cfg.grad_clip_warmup_steps),
            grad_clip_warmup_factor=sched_cfg.grad_clip_warmup_factor,
            warmup_steps=int(sched_cfg.t_warmup),
            alpha_f=sched_cfg.alpha_f,
            t_max=None if sched_cfg.t_max is None else int(sched_cfg.t_max),
        )
    elif sched_cfg.name == SchedulerType.inverse_sqrt_with_warmup:
        return InvSqrtWithWarmup(
            grad_clip_warmup_steps=None
            if sched_cfg.grad_clip_warmup_steps is None
            else int(sched_cfg.grad_clip_warmup_steps),
            grad_clip_warmup_factor=sched_cfg.grad_clip_warmup_factor,
            warmup_steps=int(sched_cfg.t_warmup),
        )
    elif sched_cfg.name == SchedulerType.max_scheduler:
        return MaxScheduler(
            grad_clip_warmup_steps=None
            if sched_cfg.grad_clip_warmup_steps is None
            else int(sched_cfg.grad_clip_warmup_steps),
            grad_clip_warmup_factor=sched_cfg.grad_clip_warmup_factor,
            sched1=build_scheduler(cfg, replace(sched_cfg, name=SchedulerType.cosine_with_warmup)),
            sched2=build_scheduler(cfg, replace(sched_cfg, name=SchedulerType.inverse_sqrt_with_warmup)),
        )
    elif sched_cfg.name == SchedulerType.constant:
        return ConstantScheduler(
            grad_clip_warmup_steps=None
            if sched_cfg.grad_clip_warmup_steps is None
            else int(sched_cfg.grad_clip_warmup_steps),
            grad_clip_warmup_factor=sched_cfg.grad_clip_warmup_factor,
        )
    else:
        raise NotImplementedError
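

# A rough usage sketch (not taken from this repo's trainer; ``cfg.max_steps`` and the
# forward/backward lines are placeholders for whatever the training loop actually does):
#
#     optim = build_optimizer(cfg, model)
#     scheduler = build_scheduler(cfg)
#     for step in range(1, cfg.max_steps + 1):
#         loss = model(batch).loss  # placeholder forward
#         loss.backward()
#         optim.clip_grads_and_collect_metrics(step)
#         for group in optim.param_groups:
#             group["lr"] = scheduler.get_lr(cfg.optimizer.learning_rate, step, cfg.max_steps)
#         optim.step()
#         optim.zero_grad(set_to_none=True)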