jupyterjazz committed
Commit 11ba200
1 Parent(s): 77a17f7

refactor: revert alibi stuff


Signed-off-by: jupyterjazz <[email protected]>

Files changed (1)
  1. mha.py +33 -5
mha.py CHANGED
@@ -56,7 +56,15 @@ class FlashSelfAttention(nn.Module):
         (default: 0.0)
     """
 
-    def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, alibi_slopes=None, deterministic=False):
+    def __init__(
+        self,
+        causal=False,
+        softmax_scale=None,
+        attention_dropout=0.0,
+        window_size=(-1, -1),
+        alibi_slopes=None,
+        deterministic=False,
+    ):
         super().__init__()
         assert flash_attn_varlen_qkvpacked_func is not None, "FlashAttention is not installed"
         assert flash_attn_qkvpacked_func is not None, "FlashAttention is not installed"
@@ -64,6 +72,7 @@ class FlashSelfAttention(nn.Module):
         self.softmax_scale = softmax_scale
         self.drop = nn.Dropout(attention_dropout)
         self.register_buffer("alibi_slopes", alibi_slopes, persistent=False)
+        self.window_size = window_size
         self.deterministic = deterministic
 
     def forward(self, qkv, causal=None, cu_seqlens=None, max_seqlen=None):
@@ -87,6 +96,8 @@ class FlashSelfAttention(nn.Module):
         assert qkv.is_cuda
         causal = self.causal if causal is None else causal
         unpadded = cu_seqlens is not None
+        if self.alibi_slopes is not None:
+            self.alibi_slopes = self.alibi_slopes.to(torch.float32)
         if unpadded:
             assert cu_seqlens.dtype == torch.int32
             assert max_seqlen is not None
@@ -99,6 +110,7 @@ class FlashSelfAttention(nn.Module):
                 softmax_scale=self.softmax_scale,
                 causal=causal,
                 alibi_slopes=self.alibi_slopes,
+                window_size=self.window_size,
                 deterministic=self.deterministic,
             )
         else:
@@ -108,6 +120,7 @@ class FlashSelfAttention(nn.Module):
                 softmax_scale=self.softmax_scale,
                 causal=causal,
                 alibi_slopes=self.alibi_slopes,
+                window_size=self.window_size,
                 deterministic=self.deterministic,
             )
 
@@ -123,7 +136,15 @@ class FlashCrossAttention(nn.Module):
         (default: 0.0)
     """
 
-    def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, alibi_slopes=None, deterministic=False):
+    def __init__(
+        self,
+        causal=False,
+        softmax_scale=None,
+        attention_dropout=0.0,
+        alibi_slopes=None,
+        window_size=(-1, -1),
+        deterministic=False,
+    ):
         super().__init__()
         assert flash_attn_varlen_kvpacked_func is not None, "FlashAttention is not installed"
         assert flash_attn_kvpacked_func is not None, "FlashAttention is not installed"
@@ -131,6 +152,7 @@ class FlashCrossAttention(nn.Module):
         self.softmax_scale = softmax_scale
         self.drop = nn.Dropout(attention_dropout)
         self.register_buffer("alibi_slopes", alibi_slopes, persistent=False)
+        self.window_size = window_size
         self.deterministic = deterministic
 
     def forward(
@@ -160,6 +182,8 @@ class FlashCrossAttention(nn.Module):
         assert q.is_cuda and kv.is_cuda
         causal = self.causal if causal is None else causal
         unpadded = cu_seqlens is not None
+        if self.alibi_slopes is not None:
+            self.alibi_slopes = self.alibi_slopes.to(torch.float32)
         if unpadded:
             assert cu_seqlens.dtype == torch.int32
             assert max_seqlen is not None
@@ -179,6 +203,7 @@ class FlashCrossAttention(nn.Module):
                 softmax_scale=self.softmax_scale,
                 causal=causal,
                 alibi_slopes=self.alibi_slopes,
+                window_size=self.window_size,
                 deterministic=self.deterministic,
             )
         else:
@@ -192,6 +217,7 @@ class FlashCrossAttention(nn.Module):
                 causal=causal,
                 softmax_scale=self.softmax_scale,
                 alibi_slopes=self.alibi_slopes,
+                window_size=self.window_size,
                 deterministic=self.deterministic,
             )
 
@@ -367,6 +393,7 @@ class MHA(nn.Module):
         rotary_emb_scale_base=None,
         rotary_emb_interleaved=False,
         use_alibi=False,
+        window_size=(-1, -1),
         fused_bias_fc=False,
         use_flash_attn=False,
         return_residual=False,
@@ -396,6 +423,8 @@ class MHA(nn.Module):
             alibi_slopes = torch.tensor(get_alibi_slopes(num_heads), device=device)
         else:
             alibi_slopes = None
+        if window_size != (-1, -1):
+            assert use_flash_attn, "Local (sliding window) attention code path requires flash_attn"
 
         self.num_heads = num_heads
         self.num_heads_kv = num_heads_kv if num_heads_kv is not None else num_heads
@@ -426,12 +455,12 @@ class MHA(nn.Module):
         )
         wqkv_cls = linear_cls if not self.return_residual else linear_resid_cls
         inner_attn_cls = (
-            partial(FlashSelfAttention, alibi_slopes=alibi_slopes)
+            partial(FlashSelfAttention, alibi_slopes=alibi_slopes, window_size=window_size)
             if use_flash_attn
             else SelfAttention
         )
         inner_cross_attn_cls = (
-            partial(FlashCrossAttention, alibi_slopes=alibi_slopes)
+            partial(FlashCrossAttention, alibi_slopes=alibi_slopes, window_size=window_size)
             if use_flash_attn
             else CrossAttention
         )
@@ -584,7 +613,6 @@ class MHA(nn.Module):
             assert key_padding_mask is None
             assert self.use_flash_attn
             assert not self.dwconv
-            # assert self.rotary_emb_dim == 0
         if key_padding_mask is not None:
            assert cu_seqlens is None
            assert max_seqlen is None
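
For readers trying out this change, a minimal usage sketch (not part of the commit). It assumes flash-attn is installed with a CUDA device available and that this repo's mha.py exposes the MHA class shown above; the embed_dim, num_heads, causal, device, and dtype constructor arguments are assumed from the upstream flash-attn MHA interface and are not visible in these hunks.

import torch
from mha import MHA  # this repository's module, as modified by this commit

# window_size follows the flash-attn convention (left, right); (-1, -1) means
# no sliding window, i.e. full attention. Any other value requires
# use_flash_attn=True, per the new assert added in MHA.__init__.
model = MHA(
    embed_dim=1024,        # assumed parameter name (upstream flash-attn MHA)
    num_heads=16,
    causal=True,           # assumed, mirrors FlashSelfAttention(causal=...)
    use_alibi=True,        # builds alibi_slopes via get_alibi_slopes(num_heads)
    window_size=(256, 0),  # attend to at most 256 tokens to the left
    use_flash_attn=True,
    device="cuda",
    dtype=torch.float16,
)

x = torch.randn(2, 512, 1024, device="cuda", dtype=torch.float16)
out = model(x)             # (batch, seqlen, embed_dim)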