Update modeling_jamba.py - LoRA support in Mamba (#6)
- Update modeling_jamba.py - LoRA support in Mamba (409c904957803838229e49676ec3958c2205783d)
- modeling_jamba.py +12 -4
modeling_jamba.py CHANGED
@@ -943,14 +943,22 @@ class JambaMambaMixer(nn.Module):
         # in order to make quantization work. Quantization code replaces `torch.nn.Linear` layers with quantized
         # linear layers, and requires to call the forward pass directly.
         # The original code here was: ```discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)```
-        time_proj_bias = self.dt_proj.bias
-        self.dt_proj.bias = None
+        if hasattr(self.dt_proj, "base_layer"):
+            # In case of LoRA, we need to access the base layer to get the weight
+            time_proj_bias = self.dt_proj.base_layer.bias
+            self.dt_proj.base_layer.bias = None
+        else:
+            time_proj_bias = self.dt_proj.bias
+            self.dt_proj.bias = None
         discrete_time_step = self.dt_proj(time_step).transpose(1, 2)
-        self.dt_proj.bias = time_proj_bias
+        if hasattr(self.dt_proj, "base_layer"):
+            self.dt_proj.base_layer.bias = time_proj_bias
+        else:
+            self.dt_proj.bias = time_proj_bias
 
         A = -torch.exp(self.A_log.float())
         # 3.c perform the recurrence y ← SSM(A, B, C)(x)
-        time_proj_bias =
+        time_proj_bias = time_proj_bias.float() if time_proj_bias is not None else None
         if cache_params is not None and cache_params.seqlen_offset > 0:
             scan_outputs = selective_state_update(
                 cache_params.ssm_states[self.layer_idx],
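
Context for the `hasattr(self.dt_proj, "base_layer")` check: when `dt_proj` is wrapped by a PEFT LoRA adapter, the original `nn.Linear` (and its bias) is kept under `base_layer`, so the temporary bias removal and restore have to happen on that base layer rather than on the wrapper. Below is a minimal sketch of that wrapping, assuming a recent version of the `peft` library; `TinyMixer` is a hypothetical stand-in for the Mamba mixer, not code from this repo.

```python
# Minimal sketch (not part of this commit): shows that a LoRA-wrapped Linear keeps
# the original module, including its bias, reachable as `.base_layer`.
# Assumes the `peft` library; `TinyMixer` is a hypothetical stand-in for the mixer.
import torch.nn as nn
from peft import LoraConfig, inject_adapter_in_model


class TinyMixer(nn.Module):
    def __init__(self):
        super().__init__()
        self.dt_proj = nn.Linear(16, 32, bias=True)


model = TinyMixer()
lora_config = LoraConfig(r=4, lora_alpha=8, target_modules=["dt_proj"])
model = inject_adapter_in_model(lora_config, model)

# The LoRA wrapper replaces dt_proj; the original Linear and its bias now live on
# dt_proj.base_layer, which is what the patched forward pass reads and restores.
print(hasattr(model.dt_proj, "base_layer"))       # True for the LoRA-wrapped layer
print(model.dt_proj.base_layer.bias is not None)  # True: bias is kept on the base layer
```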