Build uploaded using `kernels`.
build/torch-cuda/_ops.py
CHANGED

```diff
@@ -1,8 +1,8 @@
 import torch
-ops = torch.ops.
+ops = torch.ops._flash_attn4_474fc55
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_flash_attn4_474fc55::{op_name}"
```
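The change above repoints the op namespace at this build's suffix, `_flash_attn4_474fc55` (the previous suffix is truncated in the diff view). The suffix appears to be a per-build hash, so each uploaded build registers its ops under a unique `torch.ops` namespace and different builds can coexist in one process. A minimal sketch of how such a namespaced op is registered and resolved — `demo_add` is a made-up op, not part of this package:

```python
import torch

namespace = "_flash_attn4_474fc55"

def add_op_namespace_prefix(op_name: str) -> str:
    """Prefix op by namespace (mirrors the helper in _ops.py)."""
    return f"{namespace}::{op_name}"

# Register a toy op under the build's namespace (requires PyTorch >= 2.4).
@torch.library.custom_op(add_op_namespace_prefix("demo_add"), mutates_args=())
def demo_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    return x + y

# Resolve the namespace the same way _ops.py does, then call the op.
ops = getattr(torch.ops, namespace)
print(ops.demo_add(torch.ones(2), torch.ones(2)))  # tensor([2., 2.])
```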
build/torch-cuda/flash_bwd_sm100.py
CHANGED

```diff
@@ -1544,6 +1544,7 @@ class FlashAttentionBackwardSm100:
         )
         # Dealloc the tensor memory buffer
         tmem.relinquish_alloc_permit()
+        tmem_alloc_barrier.arrive_and_wait()
         tmem.free(tmem_ptr)
 
         # Compute
@@ -1595,6 +1596,7 @@ class FlashAttentionBackwardSm100:
             fastdiv_mods,
             blocksparse_tensors,
         )
+        tmem_alloc_barrier.arrive()
 
         # Reduce
         # (0, 1, 2, 3) - dQ
@@ -1615,6 +1617,7 @@ class FlashAttentionBackwardSm100:
             mdQ_semaphore,
             blocksparse_tensors,
         )
+        tmem_alloc_barrier.arrive()
 
         return
 
```
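In the backward kernel, the compute and reduce paths now `arrive()` on a `tmem_alloc_barrier` once they are done with tensor memory, and the deallocating path does `arrive_and_wait()` before `tmem.free(tmem_ptr)`. This looks like a fix for a free-while-in-use race: previously nothing stopped the freeing warp from releasing the buffer while other warps were still reading it. As a CPU-side analogy only (the kernel uses CuTe DSL barriers; note that the GPU `arrive()` is non-blocking, whereas Python's `Barrier.wait()` blocks every party):

```python
import threading

NUM_CONSUMERS = 2  # stands in for the compute and reduce paths
# Consumers plus the one thread that frees the buffer.
barrier = threading.Barrier(NUM_CONSUMERS + 1)

def consumer(name: str) -> None:
    print(f"{name}: done reading the buffer")
    barrier.wait()  # ~ tmem_alloc_barrier.arrive()

def deallocator() -> None:
    barrier.wait()  # ~ tmem_alloc_barrier.arrive_and_wait()
    print("deallocator: every consumer arrived; safe to free")  # ~ tmem.free(tmem_ptr)

threads = [threading.Thread(target=consumer, args=(f"consumer-{i}",))
           for i in range(NUM_CONSUMERS)]
threads.append(threading.Thread(target=deallocator))
for t in threads:
    t.start()
for t in threads:
    t.join()
```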
build/torch-cuda/flash_fwd_sm100.py
CHANGED

```diff
@@ -1090,6 +1090,7 @@ class FlashAttentionForwardSm100:
         )
         # Dealloc the tensor memory buffer
         tmem.relinquish_alloc_permit()
+        tmem_alloc_barrier.arrive_and_wait()
         tmem.free(tmem_ptr)
 
         # ///////////////////////////////////////////////////////////////////////////////
@@ -1157,6 +1158,8 @@ class FlashAttentionForwardSm100:
         if warp_idx < self.correction_warp_ids[0] and warp_idx >= self.softmax1_warp_ids[0]:
             softmax_loop(stage=1, tStS=tStS)
 
+            tmem_alloc_barrier.arrive()
+
         # ///////////////////////////////////////////////////////////////////////////////
         # Correction
         # ///////////////////////////////////////////////////////////////////////////////
@@ -1189,6 +1192,7 @@ class FlashAttentionForwardSm100:
             TileSchedulerCls,
             blocksparse_tensors,
         )
+        tmem_alloc_barrier.arrive()
 
         return
 
```
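The forward kernel gets the same treatment: the softmax stage-1 and correction paths each `arrive()` when they finish with tensor memory, and the epilogue's `arrive_and_wait()` before `tmem.free(tmem_ptr)` presumably closes the same free-while-in-use race as in the backward pass.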