Update modeling_recllama.py
modeling_recllama.py CHANGED (+52 -26)
@@ -45,6 +45,7 @@ class RecLlamaConfig(PretrainedConfig):
         coda_layers:int = 2,
         mean_recurrence:int = 12,
         max_backprop_depth:int = 8,
+        max_recurrence:int = 18,
         **kwargs
     ):
         self.vocab_size = vocab_size
@@ -79,6 +80,7 @@ class RecLlamaConfig(PretrainedConfig):
         self.coda_layers = coda_layers
         self.mean_recurrence = mean_recurrence
         self.max_backprop_depth = max_backprop_depth
+        self.max_recurrence = max_recurrence
         self.auto_map = {"AutoModelForCausalLM": "Arthur-LAGACHERIE/RecLlama-code--modeling_recllama.RecLlamaForCausalLM", "AutoConfig":"Arthur-LAGACHERIE/RecLlama-code--modeling_recllama.RecLlamaConfig"}

         super().__init__(
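The new `max_recurrence` field gives the config a hard ceiling on sampled recurrence depth, alongside the existing `mean_recurrence` and `max_backprop_depth`. A minimal construction sketch; constructor arguments not visible in this diff are assumed to keep their defaults:

```python
# Sketch: constructing the updated config (only fields shown in this diff;
# all other constructor arguments are assumed to keep their defaults).
from modeling_recllama import RecLlamaConfig

config = RecLlamaConfig(
    coda_layers=2,
    mean_recurrence=12,
    max_backprop_depth=8,
    max_recurrence=18,  # new in this commit
)
assert config.max_recurrence == 18
```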
@@ -89,18 +91,6 @@ class RecLlamaConfig(PretrainedConfig):
             **kwargs,
         )

-
-@dataclass
-class CausalLMOutputRecurrentLatents(ModelOutput):
-    loss: Optional[torch.Tensor] = None
-    log_ppl: Optional[torch.Tensor] = None
-    logits: Optional[torch.Tensor] = None
-    past_key_values: Optional[Cache] = None
-    latent_states: Optional[torch.Tensor] = None
-    hidden_states: Optional[torch.Tensor] = None
-    attention_maps: Optional[dict[int, torch.Tensor]] = None
-    stats: Optional[dict] = None
-


 class RecDynamicCache(DynamicCache):
@@ -146,7 +136,6 @@ class RecDynamicCache(DynamicCache):
         else:
             self.key_cache[layer_name] = torch.cat([self.key_cache[layer_name], key_states], dim=-2)
             self.value_cache[layer_name] = torch.cat([self.value_cache[layer_name], value_states], dim=-2)
-
         return self.key_cache[layer_name], self.value_cache[layer_name]

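For context on the `RecDynamicCache` hunk above: the cache is keyed by a string `layer_name`, and each update concatenates incoming key/value states along the sequence axis (`dim=-2`). A standalone sketch of that semantics; the layer name and tensor shapes are illustrative assumptions, not taken from the file:

```python
# Illustrative only: shapes are (batch, num_heads, seq_len, head_dim) and
# "prelude_0" is a made-up layer name.
import torch

key_cache = {"prelude_0": torch.randn(1, 8, 5, 64)}  # 5 cached positions
new_keys = torch.randn(1, 8, 1, 64)                  # 1 incoming position

key_cache["prelude_0"] = torch.cat([key_cache["prelude_0"], new_keys], dim=-2)
print(key_cache["prelude_0"].shape)  # torch.Size([1, 8, 6, 64])
```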
@@ -157,7 +146,42 @@ class RecLlamaForCausalLM(LlamaForCausalLM):
         self.prelude_layers = config.prelude_layers
         self.recurrent_layers = config.recurrent_layers
         self.coda_layers = config.coda_layers
+
+        for i in range(len(self.model.layers)):
+            self.model.layers[i].self_attn.k_proj.bias = nn.Parameter(torch.randn(1, self.model.layers[i].self_attn.k_proj.out_features)) #nn.Parameter(torch.full((1, self.model.layers[i].self_attn.k_proj.out_features), k_bias_value))
+            self.model.layers[i].self_attn.q_proj.bias = nn.Parameter(torch.randn(1, self.model.layers[i].self_attn.q_proj.out_features))
+
+
+    def get_recurrent_params(self):
+        recurrent_params = []
+
+        # Get indices of recurrent layers
+        recurrent_start = self.prelude_layers
+        recurrent_end = self.prelude_layers + self.recurrent_layers
+
+        # Extract parameters from recurrent layers
+        for layer_idx in range(recurrent_start, recurrent_end):
+            layer = self.model.layers[layer_idx]
+            for param_name, param in layer.named_parameters():
+                recurrent_params.append(param)
+
+        return sum(p.numel() for p in recurrent_params)
+
+    def get_param_count(self):
+        return sum(p.numel() for p in self.parameters())

+    def add_bias(self, q_bias_value=0.1, k_bias_value=0.1):
+        for i in range(len(self.model.layers)):
+            self.model.layers[i].self_attn.k_proj.bias = nn.Parameter(torch.randn(1, self.model.layers[i].self_attn.k_proj.out_features)) #nn.Parameter(torch.full((1, self.model.layers[i].self_attn.k_proj.out_features), k_bias_value))
+            self.model.layers[i].self_attn.q_proj.bias = nn.Parameter(torch.randn(1, self.model.layers[i].self_attn.q_proj.out_features))
+
+    @staticmethod
+    def add_bias_to_model(model, q_bias_value=0.1, k_bias_value=0.1):
+        for i in range(len(model.model.layers)):
+            model.model.layers[i].self_attn.k_proj.bias = nn.Parameter(torch.zeros(1, model.model.layers[i].self_attn.k_proj.out_features))
+            model.model.layers[i].self_attn.q_proj.bias = nn.Parameter(torch.zeros(1, model.model.layers[i].self_attn.q_proj.out_features))
+        return model
+
     @classmethod
     def from_llama_model(
         cls,
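A hedged sketch of how the helpers added in this hunk might be used, assuming `model` is an instantiated `RecLlamaForCausalLM`. Two things worth noting from the diff itself: `get_recurrent_params` returns a parameter count (it sums `numel` over the collected tensors), and `add_bias` as committed draws fresh biases from `torch.randn`, so its `q_bias_value`/`k_bias_value` arguments are currently unused (a `torch.full` variant survives in a comment):

```python
# Sketch, assuming `model` is a RecLlamaForCausalLM instance.
total = model.get_param_count()           # numel over all parameters
recurrent = model.get_recurrent_params()  # numel over the recurrent block only
print(f"recurrent share of parameters: {recurrent / total:.1%}")

model.add_bias()  # re-creates q/k biases; randn-initialised, value args unused
print(model.model.layers[0].self_attn.q_proj.bias.shape)  # torch.Size([1, out_features])
```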
@@ -167,6 +191,7 @@ class RecLlamaForCausalLM(LlamaForCausalLM):
         coda_layers: int,
         mean_recurrence: int = 4,
         max_backprop_depth: int = 6,
+        max_recurrence: int = 8,
     ) -> "RecLlamaForCausalLM":
         """
         Convert a regular LlamaForCausalLM model to a RecLlamaForCausalLM model.
@@ -197,13 +222,14 @@ class RecLlamaForCausalLM(LlamaForCausalLM):
         config.coda_layers = coda_layers
         config.mean_recurrence = mean_recurrence
         config.max_backprop_depth = max_backprop_depth
+        config.max_recurrence = max_recurrence

         rec_model = cls(config)
         rec_model.model.embed_tokens = llama_model.model.embed_tokens
         rec_model.model.norm = llama_model.model.norm
         rec_model.model.layers = llama_model.model.layers
         rec_model.lm_head = llama_model.lm_head
-
+        rec_model = RecLlamaForCausalLM.add_bias_to_model(rec_model)
         return rec_model

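Usage sketch for the updated conversion path, which now threads `max_recurrence` into the config and zero-initialises the q/k projection biases via `add_bias_to_model`. The checkpoint name is illustrative, and the `prelude_layers`/`recurrent_layers` keywords are assumptions inferred from the config fields, since they sit outside this hunk:

```python
# Sketch: converting a stock Llama checkpoint (checkpoint name illustrative).
from transformers import LlamaForCausalLM

llama = LlamaForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
rec = RecLlamaForCausalLM.from_llama_model(
    llama,
    prelude_layers=2,      # assumed keyword, not shown in this hunk
    recurrent_layers=4,    # assumed keyword, not shown in this hunk
    coda_layers=2,
    mean_recurrence=4,
    max_backprop_depth=6,
    max_recurrence=8,      # new argument added in this commit
)
# from_llama_model now ends by calling add_bias_to_model, so the converted
# model starts with zero-initialised q/k projection biases.
```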
@@ -224,7 +250,7 @@ class RecLlamaForCausalLM(LlamaForCausalLM):
         num_steps: int = None,
         **kwargs: Unpack[KwargsForCausalLM],
     ) -> Union[Tuple, CausalLMOutputWithPast]:
-
+
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -243,7 +269,7 @@ class RecLlamaForCausalLM(LlamaForCausalLM):
         cache_position = torch.arange(
             past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
         )
-
+
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)

@@ -275,15 +301,15 @@ class RecLlamaForCausalLM(LlamaForCausalLM):

         # recurrent block
         inputs_embeds = self.iterate_forward(
-            inputs_embeds,
-            causal_mask,
-            position_ids,
-            past_key_values,
-            output_attentions,
-            use_cache,
-            cache_position,
-            position_embeddings,
-            num_steps
+            inputs_embeds=inputs_embeds,
+            attention_mask=causal_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            num_steps=num_steps
         )

         # coda blocks
@@ -402,7 +428,7 @@ class RecLlamaForCausalLM(LlamaForCausalLM):
             mu = math.log(t) - (sigma**2 / 2)
             rate = torch.zeros((1,), dtype=torch.float).log_normal_(mean=mu, std=sigma)
             n = torch.poisson(rate) + 1  # Corrected Poisson sampling
-            n = torch.clamp(n, min=0)  # Ensure non-negative
+            n = torch.clamp(n, min=0, max=self.config.max_recurrence)  # Ensure non-negative and cap at max_recurrence
             k = torch.clamp(n, max=self.config.max_backprop_depth)  # Limit k properly
         else:
             n = torch.tensor(self.config.mean_recurrence, dtype=torch.long)
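The sampling path above draws a rate from a log-normal whose mean is `t` (since `mu = log(t) - sigma**2/2`), Poisson-samples the iteration count `n`, and truncates the backprop depth `k`; the change clamps `n` at the new `max_recurrence` ceiling. A standalone sketch with assumed values for `sigma` and `t` (in the model, `t` presumably derives from `config.mean_recurrence`, which the non-sampling branch uses directly):

```python
# Standalone sketch of the recurrence-depth sampling; sigma and t are assumed.
import math
import torch

sigma, t = 0.5, 12.0
max_recurrence, max_backprop_depth = 18, 8

mu = math.log(t) - (sigma**2 / 2)  # chosen so the log-normal rate has mean t
rate = torch.zeros((1,), dtype=torch.float).log_normal_(mean=mu, std=sigma)
n = torch.poisson(rate) + 1                    # number of recurrent iterations
n = torch.clamp(n, min=0, max=max_recurrence)  # new upper bound from this commit
k = torch.clamp(n, max=max_backprop_depth)     # iterations that receive gradients
print(int(n.item()), int(k.item()))
```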
|