Skip swizzling when read/gather row phases are inconsistent (#699)

harsh-nod · web-flow · commit 74441e343b69 · 2026-01-09T18:44:21.000-08:00
Add validation to detect a row-phase mismatch between reads and gathers in
gather_to_shared_swizzling, and gracefully skip swizzling in such unsupported
cases. Add an F32_32x32x16_F16 MMA test for CDNA4, which fails if this
validation is not present.

Signed-off-by: harsh-nod <menonharsh@gmail.com>
diff --git a/lit_tests/kernel/wave/gather_to_shared.py b/lit_tests/kernel/wave/gather_to_shared.py
@@ -329,21 +329,11 @@ def repeat(acc: tkl.Register[M, N, tkl.f32]) -> tkl.Register[M, N, tkl.f32]:
     print(scaled_gemm.asm)
 
     # CHECK-LABEL:    test_gather_to_shared_scaled_dims
-    # CHECK:          #[[map1:.*]] = affine_map<()[s0] -> ((s0 floordiv 8) mod 8)>
-    # CHECK:          #[[map2:.*]] = affine_map<()[s0] -> (s0 mod 8)>
-    # CHECK:          #[[map6:.*]] = affine_map<()[s0] -> ((s0 mod 64) floordiv 16)>
-    # CHECK:          #[[map7:.*]] = affine_map<()[s0] -> ((s0 mod 64) floordiv 16 + 4)>
     # CHECK:          func.func @scaled_gemm
     # CHECK:          %[[thread_id_x:.*]] = gpu.thread_id x
     # CHECK-COUNT-1:    memref.alloc()
-    # Check some swizzling was done
-    # CHECK:          %[[col:.*]] = affine.apply #[[map1]]()[%[[thread_id_x]]]
-    # CHECK:          %[[row:.*]] = affine.apply #[[map2]]()[%[[thread_id_x]]]
-    # CHECK:          %{{.*}} = arith.xori %[[row]], %[[col]] : index
-    # CHECK:          %[[row_swizzled:.*]] = affine.apply #[[map6]]()[%[[thread_id_x]]]
-    # CHECK:          %[[row_swizzled_2:.*]] = affine.apply #[[map7]]()[%[[thread_id_x]]]
-    # CHECK:          %{{.*}} = arith.xori %[[row_swizzled]], %[[row]] : index
-    # CHECK:          %{{.*}} = arith.xori %[[row_swizzled_2]], %[[row]] : index
+    # Note: Swizzling is disabled for this test due to row phase inconsistency
+    # between reads and gathers.
     # CHECK:            scf.for
     # CHECK:              amdgpu.lds_barrier
     # CHECK-COUNT-4:      amdgpu.gather_to_lds {{.*}}
diff --git a/tests/kernel/wave_gemm_mxfp_test.py b/tests/kernel/wave_gemm_mxfp_test.py
@@ -397,31 +397,29 @@ def testScaledBatchedGemmMXFP4Codegen(use_water_backend: bool, tmp_path: Path):
     # We encode the exact registers and wait counts as we want to know if
     # they suddenly change due to backend or upstream MLIR changes.
     if use_water_backend:
-        vgpr_count = 148
+        vgpr_count = 154
         vgpr_spill_count = 0
         sgpr_count = 58
         sgpr_spill_count = 0
         waitcounts = [
             "s_waitcnt lgkmcnt(0)",
             "s_waitcnt vmcnt(0)",
-            "s_waitcnt lgkmcnt(10)",
-            "s_waitcnt lgkmcnt(1)",
-            "s_waitcnt lgkmcnt(0)",
+            "s_waitcnt lgkmcnt(8)",
             "s_waitcnt lgkmcnt(1)",
             "s_waitcnt lgkmcnt(1)",
             "s_waitcnt vmcnt(0) lgkmcnt(0)",
             "s_waitcnt vmcnt(0)",
-            "s_waitcnt lgkmcnt(8)",
+            "s_waitcnt lgkmcnt(7)",
             "s_waitcnt lgkmcnt(6)",
             "s_waitcnt lgkmcnt(5)",
+            "s_waitcnt lgkmcnt(4)",
             "s_waitcnt lgkmcnt(3)",
-            "s_waitcnt lgkmcnt(1)",
             "s_waitcnt lgkmcnt(2)",
             "s_waitcnt lgkmcnt(1)",
             "s_waitcnt lgkmcnt(0)",
         ]
     else:
-        vgpr_count = 162
+        vgpr_count = 160
         vgpr_spill_count = 0
         sgpr_count = 59
         sgpr_spill_count = 0
@@ -430,10 +428,10 @@ def testScaledBatchedGemmMXFP4Codegen(use_water_backend: bool, tmp_path: Path):
             "s_waitcnt vmcnt(0)",
             "s_waitcnt vmcnt(0) lgkmcnt(0)",
             "s_waitcnt vmcnt(0)",
-            "s_waitcnt lgkmcnt(7)",
-            "s_waitcnt lgkmcnt(6)",
             "s_waitcnt lgkmcnt(5)",
+            "s_waitcnt lgkmcnt(4)",
             "s_waitcnt lgkmcnt(3)",
+            "s_waitcnt lgkmcnt(2)",
             "s_waitcnt lgkmcnt(1)",
             "s_waitcnt lgkmcnt(0)",
         ]
diff --git a/tests/kernel/wave_gemm_test.py b/tests/kernel/wave_gemm_test.py
@@ -352,6 +352,7 @@ def testPureGemm(
     [
         pytest.param(MMAType.F32_16x16x16_F16, 64, marks=require_cdna_3_or_4),
         pytest.param(MMAType.F32_32x32x8_F16, 64, marks=require_cdna_3_or_4),
+        pytest.param(MMAType.F32_32x32x16_F16, 64, marks=require_cdna4),
         pytest.param(MMAType.GFX1250_F32_16x16x32_F16, 32, marks=require_gfx1250),
     ],
 )
diff --git a/wave_lang/kernel/wave/gather_to_shared.py b/wave_lang/kernel/wave/gather_to_shared.py
@@ -629,6 +629,21 @@ def gather_to_shared_swizzling(
 
         max_phase = 8
 
+        # Check row phase inconsistency between reads and gathers.
+        gather_local_index = remove_global_indexing(gather.src_index, constraints)
+        read_local_index = remove_global_indexing(read.index, constraints)
+        gather_row_expr = sympy.simplify(
+            subs_idxc(gather_local_index[row_dim].start) % max_phase
+        )
+        read_row_expr = sympy.simplify(
+            subs_idxc(read_local_index[row_dim].start) % max_phase
+        )
+        if gather_row_expr != read_row_expr:
+            logger.info(
+                f"row phase inconsistency between reads and gathers: {gather_row_expr} != {read_row_expr}. Skipping swizzling as it is not supported."
+            )
+            continue
+
         for read in reads:
             index = remove_global_indexing(read.index, constraints)
             col_seq = index[col_dim]

Original file line number	Diff line number	Diff line change
`@@ -352,6 +352,7 @@ def testPureGemm(`
`352`	`352`	`[`
`353`	`353`	`pytest.param(MMAType.F32_16x16x16_F16, 64, marks=require_cdna_3_or_4),`
`354`	`354`	`pytest.param(MMAType.F32_32x32x8_F16, 64, marks=require_cdna_3_or_4),`
	`355`	`+ pytest.param(MMAType.F32_32x32x16_F16, 64, marks=require_cdna4),`
`355`	`356`	`pytest.param(MMAType.GFX1250_F32_16x16x32_F16, 32, marks=require_gfx1250),`
`356`	`357`	`],`
`357`	`358`	`)`