Commit e8b93e0

apriel2 modeling bug
1 parent 691d8b2 commit e8b93e0

2 files changed: +121 −10 lines changed
fast_llm_external_models/apriel2/modeling_apriel2.py

Lines changed: 32 additions & 10 deletions
@@ -1323,8 +1323,17 @@ def _recurrent_gated_delta_rule(self, query, key, value, g, beta, state):
         """Single-step recurrent update for cached inference.

         Input shapes: [batch, seq=1, heads, dim]
-        Need shapes: [batch, heads, dim] for einsum operations
+        State shape: [batch, heads, key_dim, value_dim]
+
+        Implements the delta rule recurrence:
+        1. Decay state: S = S * exp(g)
+        2. Retrieve memory: mem = S @ k
+        3. Compute delta: delta = (v - mem) * beta
+        4. Update state: S = S + k ⊗ delta
+        5. Output: o = S @ q (scaled)
         """
+        input_dtype = query.dtype
+
         # Transpose from [batch, seq, heads, dim] to [batch, heads, seq, dim]
         query = query.transpose(1, 2)
         key = key.transpose(1, 2)
@@ -1334,25 +1343,38 @@ def _recurrent_gated_delta_rule(self, query, key, value, g, beta, state):
         query = _l2norm(query, dim=-1, eps=1e-6)
         key = _l2norm(key, dim=-1, eps=1e-6)

+        # Apply query scaling (matches chunked mode)
+        scale = 1.0 / (query.shape[-1] ** 0.5)
+        query = query * scale
+
         # Reshape for computation: [batch, heads, 1, dim] -> [batch, heads, dim]
         query = query.squeeze(2)
         key = key.squeeze(2)
         value = value.squeeze(2)
         g = g.squeeze(1)
         beta = beta.squeeze(1)

-        # Update state: S = exp(g) * S + beta * k^T @ v
-        # Keep everything in the same dtype as input (exp() returns float32, need to convert back)
-        input_dtype = query.dtype
+        # 1. Decay state: S = S * exp(g)
         decay = g.exp().to(input_dtype).unsqueeze(-1).unsqueeze(-1)  # [batch, heads, 1, 1]
-        k_outer_v = torch.einsum("bhk,bhv->bhkv", key * beta.unsqueeze(-1), value)
-        state = decay * state + k_outer_v
+        state = state * decay
+
+        # 2. Retrieve memory: mem = S @ k = (S * k.unsqueeze(-1)).sum(dim=-2)
+        # state: [batch, heads, key_dim, value_dim], key: [batch, heads, key_dim]
+        kv_mem = (state * key.unsqueeze(-1)).sum(dim=-2)  # [batch, heads, value_dim]
+
+        # 3. Compute delta: delta = (v - mem) * beta
+        delta = (value - kv_mem) * beta.unsqueeze(-1)  # [batch, heads, value_dim]
+
+        # 4. Update state: S = S + k ⊗ delta
+        # k.unsqueeze(-1): [batch, heads, key_dim, 1]
+        # delta.unsqueeze(-2): [batch, heads, 1, value_dim]
+        state = state + key.unsqueeze(-1) * delta.unsqueeze(-2)

-        # Output: o = q @ S
-        output = torch.einsum("bhk,bhkv->bhv", query, state)
-        output = output.unsqueeze(2)  # [batch, heads, 1, v_dim]
+        # 5. Output: o = S @ q = (S * q.unsqueeze(-1)).sum(dim=-2)
+        output = (state * query.unsqueeze(-1)).sum(dim=-2)  # [batch, heads, value_dim]
+        output = output.unsqueeze(2)  # [batch, heads, 1, value_dim]

-        # Transpose back to [batch, seq=1, heads, v_dim]
+        # Transpose back to [batch, seq=1, heads, value_dim]
         output = output.transpose(1, 2)

         # Ensure state matches output dtype

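For reference, the five-step recurrence documented in the new docstring can be reproduced outside the model with plain tensor ops. The sketch below is a minimal standalone version, not the module's code: the helper name `recurrent_delta_step` and the toy shapes are illustrative assumptions, and it folds in the 1/sqrt(key_dim) query scaling that this commit adds to match chunked mode.

```python
# Minimal sketch of the single-step gated delta rule (steps 1-5 in the docstring above).
# The helper name and shapes are illustrative assumptions, not part of modeling_apriel2.
import torch


def recurrent_delta_step(query, key, value, g, beta, state):
    """query/key: [b, h, k_dim]; value: [b, h, v_dim]; g, beta: [b, h];
    state: [b, h, k_dim, v_dim]. query and key are assumed already L2-normalized."""
    query = query * query.shape[-1] ** -0.5              # query scaling (matches chunked mode)
    state = state * g.exp()[..., None, None]              # 1. decay state: S = S * exp(g)
    mem = (state * key[..., None]).sum(dim=-2)            # 2. retrieve memory: mem = S @ k
    delta = (value - mem) * beta[..., None]               # 3. delta: (v - mem) * beta
    state = state + key[..., None] * delta[..., None, :]  # 4. rank-1 update: S = S + k ⊗ delta
    output = (state * query[..., None]).sum(dim=-2)       # 5. output: o = S @ q
    return output, state


if __name__ == "__main__":
    b, h, k_dim, v_dim = 2, 4, 8, 16
    q = torch.nn.functional.normalize(torch.randn(b, h, k_dim), dim=-1)
    k = torch.nn.functional.normalize(torch.randn(b, h, k_dim), dim=-1)
    v = torch.randn(b, h, v_dim)
    g, beta = -torch.rand(b, h), torch.rand(b, h)         # log-decay (exp(g) <= 1), write strength
    state = torch.zeros(b, h, k_dim, v_dim)
    out, state = recurrent_delta_step(q, k, v, g, beta, state)
    print(out.shape, state.shape)  # torch.Size([2, 4, 16]) torch.Size([2, 4, 8, 16])
```

Compared with the code being removed, the essential change is step 3: the update subtracts the retrieved memory S @ k from v before the rank-1 write (the delta correction), rather than adding beta * k ⊗ v directly, and the query is scaled by 1/sqrt(key_dim) as in the chunked path.
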
fast_llm_external_models/tests/test_apriel2/test_mixer_equivalence.py

Lines changed: 89 additions & 0 deletions
@@ -811,6 +811,95 @@ def test_vs_qwen3next(
             msg=f"Apriel2GatedDeltaNet vs Qwen3NextGatedDeltaNet (batch={batch_size}, seq={seq_len})",
         )

+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="GDN requires CUDA")
+    @pytest.mark.parametrize("seed", [42, 123, 456])
+    @pytest.mark.parametrize("prefill_len", [4, 8, 16])
+    def test_chunked_vs_recurrent(
+        self,
+        gdn_config,
+        seed,
+        prefill_len,
+    ):
+        """Verify GDN recurrent mode (decode) matches chunked mode (prefill).
+
+        This tests the inference path: after prefilling N tokens with chunked mode,
+        subsequent single-token decodes using recurrent mode should produce the same
+        output as if we had run the full sequence through chunked mode.
+        """
+        from fast_llm_external_models.apriel2.modeling_apriel2 import Apriel2GatedDeltaNet
+
+        value_heads, key_heads, key_head_dim, value_head_dim = gdn_config
+        hidden_size = 256
+        batch_size = 2
+        total_len = prefill_len + 4  # Prefill + 4 decode steps
+
+        config_dict = {
+            "type": "gdn",
+            "value_heads": value_heads,
+            "key_heads": key_heads,
+            "key_head_dim": key_head_dim,
+            "value_head_dim": value_head_dim,
+            "convolution_layer": {"kernel_size": 4},
+            "norm_eps": 1e-5,
+        }
+
+        # Create model
+        torch.manual_seed(seed)
+        model = Apriel2GatedDeltaNet(hidden_size, config_dict, layer_idx=0)
+        model = model.cuda()
+        model.eval()
+
+        # Create input sequence
+        torch.manual_seed(seed + 1)
+        full_hidden_states = torch.randn(batch_size, total_len, hidden_size, device="cuda")
+
+        # === Reference: Run full sequence through chunked mode ===
+        with torch.no_grad():
+            reference_output = model(full_hidden_states)[0]
+
+        # === Test: Prefill + decode ===
+        # Create a simple cache object to hold conv and recurrent states
+        class SimpleCache:
+            def __init__(self):
+                self.conv_states = {0: None}
+                self.recurrent_states = {0: None}
+
+        cache = SimpleCache()
+
+        # Prefill phase
+        prefill_input = full_hidden_states[:, :prefill_len, :]
+        with torch.no_grad():
+            prefill_output = model(
+                prefill_input,
+                past_key_values=cache,
+                cache_position=torch.arange(prefill_len, device="cuda"),
+            )[0]
+
+        # Decode phase - one token at a time
+        decode_outputs = []
+        for i in range(prefill_len, total_len):
+            decode_input = full_hidden_states[:, i : i + 1, :]
+            with torch.no_grad():
+                decode_output = model(
+                    decode_input,
+                    past_key_values=cache,
+                    cache_position=torch.tensor([i], device="cuda"),
+                )[0]
+            decode_outputs.append(decode_output)
+
+        # Concatenate prefill + decode outputs
+        test_output = torch.cat([prefill_output] + decode_outputs, dim=1)
+
+        # Use looser tolerance for chunked vs recurrent comparison
+        # (different processing order leads to numerical differences)
+        assert_close(
+            test_output,
+            reference_output,
+            rtol=1e-3,
+            atol=1e-3,
+            msg=f"GDN chunked vs recurrent mode (prefill={prefill_len}, total={total_len})",
+        )
+

 # =============================================================================
 # SECTION 2: EQUIVALENCE TESTS - KimiDeltaAttention

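For anyone exercising the same prefill-then-decode path outside the test, the pattern above condenses to the loop below. This is a sketch grounded in the test, not a public API: `SimpleCache` is the test's stand-in cache (a real setup would use the model's own cache class), and the helper name `generate_hidden_outputs` is hypothetical.

```python
# Condensed prefill + decode loop mirroring the test above.
# SimpleCache and the helper name are assumptions lifted from the test, not a public API.
import torch


class SimpleCache:
    def __init__(self):
        self.conv_states = {0: None}       # per-layer causal-conv state
        self.recurrent_states = {0: None}  # per-layer delta-rule state S


def generate_hidden_outputs(model, hidden_states, prefill_len):
    """Prefill the first `prefill_len` positions in chunked mode, then decode the rest
    one token at a time in recurrent mode, reusing the same cache throughout."""
    cache = SimpleCache()
    device = hidden_states.device
    outputs = []
    with torch.no_grad():
        # Prefill: chunked mode over the prompt.
        outputs.append(
            model(
                hidden_states[:, :prefill_len],
                past_key_values=cache,
                cache_position=torch.arange(prefill_len, device=device),
            )[0]
        )
        # Decode: recurrent mode, one token at a time.
        for i in range(prefill_len, hidden_states.shape[1]):
            outputs.append(
                model(
                    hidden_states[:, i : i + 1],
                    past_key_values=cache,
                    cache_position=torch.tensor([i], device=device),
                )[0]
            )
    return torch.cat(outputs, dim=1)
```

The test compares this concatenated output against a single chunked pass over the full sequence with rtol=atol=1e-3; the looser tolerance is deliberate, since the two modes accumulate floating-point error in a different order. On a CUDA machine the new case can be selected with `pytest fast_llm_external_models/tests/test_apriel2/test_mixer_equivalence.py -k test_chunked_vs_recurrent`; without CUDA it is skipped by the `skipif` marker.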