make FlashAttention logic more robust
modeling_gptbert.py  +5 -5
@@ -367,7 +367,7 @@ class SelfAttention(nn.Module):
         theta = 160_000 if (layer_idx + 1) % config.local_global_ratio == 0 else 10_000

         # Initialize rotary embeddings based on whether FlashAttention is available
-        if
+        if flash_attn_varlen_qkvpacked_func is not None:
             self.rope_embedding = UnpaddedRotaryEmbedding(dim=self.d_qk, base=theta, max_seqlen=config.max_sequence_length)
         else:
             self.rope_embedding = RotaryPositionalEmbeddings(config, theta)
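The `is not None` comparison only works as an availability test if the symbol is bound to `None` whenever flash-attn cannot be imported. A minimal sketch of the optional-import guard this presumably relies on (the guard itself is not part of this diff):

# Assumed module-level guard, not shown in this diff: bind the flash-attn entry
# point if the package is installed, otherwise set it to None so every
# `if flash_attn_varlen_qkvpacked_func is not None` branch takes the fallback path.
try:
    from flash_attn import flash_attn_varlen_qkvpacked_func
except ImportError:
    flash_attn_varlen_qkvpacked_func = None

With such a guard in place, the same condition can gate the rotary-embedding choice, the attention kernel, and the padding handling consistently.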
@@ -418,7 +418,7 @@ class SelfAttention(nn.Module):

     def forward(self, hidden_layer: torch.Tensor, qk_layer: torch.Tensor, v1: torch.Tensor | None, padding_info):
         # Get original shape info
-        if
+        if flash_attn_varlen_qkvpacked_func is not None:
             # Unpadded case
             indices, cu_seqlens, max_seqlen = padding_info
             total_seqlen = hidden_layer.size(0)
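For orientation, here is what the `padding_info` tuple contains for a toy batch; the values below are illustrative, not taken from the model:

# Illustrative only: a batch of 2 sequences with lengths 3 and 5, padded to seq_length = 5.
indices    = [0, 1, 2, 5, 6, 7, 8, 9]   # flat positions of real tokens in the (batch * seq_length) view
cu_seqlens = [0, 3, 8]                  # cumulative sequence lengths, with a leading zero
max_seqlen = 5                          # length of the longest sequence in the batch
# hidden_layer is then the unpadded activation of shape (total_seqlen, hidden_size) = (8, hidden_size).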
@@ -433,7 +433,7 @@ class SelfAttention(nn.Module):
         query, key = self.qk_proj(qk_layer).tensor_split([self.q_out_dim], dim=-1)
         value = self.v_proj(hidden_layer)

-        if
+        if flash_attn_varlen_qkvpacked_func is not None:
             # Reshape for FlashAttention: (total_seqlen, num_heads, head_dim)
             query = query.view(total_seqlen, self.num_attention_heads, self.d_qk)
             key = key.view(total_seqlen, self.num_kv_heads, self.d_qk)
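After this reshape the tensors are in the layout flash-attn's variable-length kernels expect. A hedged sketch of such a call, reusing the names from the hunk above and assuming the flash-attn 2.x `flash_attn_varlen_func` entry point (the module may call a different kernel, e.g. a packed-QKV variant):

from flash_attn import flash_attn_varlen_func  # assumption: flash-attn 2.x is installed

# query: (total_seqlen, num_attention_heads, d_qk); key, value: (total_seqlen, num_kv_heads, head_dim)
# Inputs must be fp16/bf16 CUDA tensors; cu_seqlens is int32, max_seqlen a Python int.
attn_output = flash_attn_varlen_func(
    query, key, value,
    cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens,
    max_seqlen_q=max_seqlen, max_seqlen_k=max_seqlen,
    causal=False,  # bidirectional, BERT-style attention
)
# attn_output keeps the unpadded layout: (total_seqlen, num_attention_heads, head_dim)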
@@ -645,7 +645,7 @@ class GptBertModel(GptBertPreTrainedModel):
         else:
             attention_mask = attention_mask.bool()

-        if
+        if flash_attn_varlen_qkvpacked_func is not None:
             if len(attention_mask.size()) != 2:
                 raise ValueError("Bare `attention_mask` med to dimensjoner støttes nå for FlashAttention.")
             with torch.no_grad():
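The error message here ("Bare `attention_mask` med to dimensjoner støttes nå for FlashAttention.", roughly: only a two-dimensional `attention_mask` is now supported for FlashAttention) guards the unpadding step. A minimal sketch of how the `torch.no_grad()` block might derive the padding metadata from that 2-D mask; the exact implementation is an assumption, and flash-attn ships a comparable helper as `flash_attn.bert_padding.unpad_input`:

import torch
import torch.nn.functional as F

with torch.no_grad():
    # attention_mask: (batch_size, seq_length) bool tensor, True for real tokens
    seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)                  # tokens per sequence
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    cu_seqlens = F.pad(seqlens.cumsum(dim=0, dtype=torch.int32), (1, 0))     # prepend a zero
    max_seqlen = int(seqlens.max())
    padding_info = (indices, cu_seqlens, max_seqlen)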
@@ -676,7 +676,7 @@ class GptBertModel(GptBertPreTrainedModel):
         contextualized_embeddings = [layer.to(original_dtype) for layer in contextualized_embeddings]

         # Pad output if using FlashAttention
-        if
+        if flash_attn_varlen_qkvpacked_func is not None:
             last_layer = _pad_output(last_layer, indices, batch_size, seq_length)
             if output_hidden_states:
                 contextualized_embeddings = [_pad_output(layer, indices, batch_size, seq_length) for layer in contextualized_embeddings]
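`_pad_output` undoes the unpadding so callers get a regular (batch_size, seq_length, hidden_size) tensor back. A minimal sketch of what such a helper typically does, matching the call signature above; this is an assumption, not the model's actual implementation:

import torch

def _pad_output(unpadded: torch.Tensor, indices: torch.Tensor, batch_size: int, seq_length: int) -> torch.Tensor:
    # unpadded: (total_seqlen, hidden_size); indices: flat positions of the real tokens
    padded = unpadded.new_zeros(batch_size * seq_length, unpadded.size(-1))
    padded[indices] = unpadded                        # scatter real tokens back; padding rows stay zero
    return padded.view(batch_size, seq_length, -1)    # (batch_size, seq_length, hidden_size)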