@@ -67,6 +67,8 @@ GPT-2 Transformer Neural Net training loop. See README.md for usage.
 #include "llmc/repkv.cuh"
 // defines: precompute_freqs_cis, rope_forward
 #include "llmc/rope.cuh"
+// defines: swiglu_forward
+#include "llmc/swiglu.cuh"
 // ----------- Multi-GPU support -----------
 // defines: ncclFloatX, ncclCheck, MultiGpuConfig, ShardInfo
 // defines: printf0, multi_gpu_config
@@ -252,6 +254,13 @@ void fill_in_activation_sizes(const ActivationTensors* data, TensorSpec (&tensor
     const size_t n_kv_head = config.num_kv_heads; // num key and value heads
     const size_t hd = C / n_head; // the size of each head
     const size_t qkv_channels = (n_head + 2*n_kv_head) * hd; // Q, K, V channels
+    // SwiGLU-related calculation to determine the number of channels here
+    size_t hidden_dim = 4 * C;
+    hidden_dim = (2 * hidden_dim) / 3;
+    hidden_dim = config.ffn_dim_multiplier * hidden_dim;
+    hidden_dim = config.multiple_of * ((hidden_dim + config.multiple_of - 1) / config.multiple_of);
+    size_t ffn_channels = hidden_dim * 2; // c_fc + c_fc2 concatenated
+    size_t ffn_channels_post_gelu = hidden_dim; // swiglu will halve the channels

     tensors[0] = TENSOR_SPEC(data->encoded, B * T * C);
     // if recompute >= 1 then we will recompute the layernorm forward activation during backward pass
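
As a side note on the sizing arithmetic added above: with Llama-3-8B-style values (C = 4096, ffn_dim_multiplier = 1.3, multiple_of = 1024 — assumed here purely for illustration, these constants are not part of this diff), the rounding lands on the familiar 14336 FFN width. A minimal standalone sketch that mirrors the same integer/float arithmetic:

#include <stdio.h>
#include <stddef.h>

int main(void) {
    // assumed example config (Llama-3-8B-like); not taken from this diff
    size_t C = 4096;
    float  ffn_dim_multiplier = 1.3f;
    size_t multiple_of = 1024;

    size_t hidden_dim = 4 * C;                                             // 16384
    hidden_dim = (2 * hidden_dim) / 3;                                     // 10922 (integer division)
    hidden_dim = (size_t)(ffn_dim_multiplier * hidden_dim);                // 14198 (truncated on assignment)
    hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) / multiple_of); // round up -> 14336

    printf("hidden_dim = %zu, ffn_channels = %zu, ffn_channels_post_gelu = %zu\n",
           hidden_dim, hidden_dim * 2, hidden_dim);                        // 14336, 28672, 14336
    return 0;
}

So ffn_channels is the width the fused c_fc/c_fc2 matmul writes, and ffn_channels_post_gelu is what swiglu_forward leaves behind.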
@@ -270,9 +279,9 @@ void fill_in_activation_sizes(const ActivationTensors* data, TensorSpec (&tensor
     tensors[7] = TENSOR_SPEC(data->ln2, (recompute < 2) ? L * B * T * C : 0);
     tensors[8] = TENSOR_SPEC(data->ln2_mean, L * B * T);
     tensors[9] = TENSOR_SPEC(data->ln2_rstd, L * B * T);
-    tensors[10] = TENSOR_SPEC(data->fch, L * B * T * 4*C);
+    tensors[10] = TENSOR_SPEC(data->fch, L * B * T * ffn_channels);
     // if recompute >= 1 then we will recompute gelu_forward during backward and use this as scratch buffer
-    tensors[11] = TENSOR_SPEC(data->fch_gelu, (recompute < 1) ? L * B * T * 4*C : B * T * 4*C);
+    tensors[11] = TENSOR_SPEC(data->fch_gelu, (recompute < 1) ? L * B * T * ffn_channels_post_gelu : B * T * ffn_channels_post_gelu);
     tensors[12] = TENSOR_SPEC(data->residual3, L * B * T * C);
     tensors[13] = TENSOR_SPEC(data->lnf, B * T * C);
     tensors[14] = TENSOR_SPEC(data->lnf_mean, B * T);
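
To make the two new activation sizes concrete: fch is sized for the concatenated c_fc/c_fc2 matmul output (ffn_channels = 2 * hidden_dim), while fch_gelu only needs the post-SwiGLU activation (ffn_channels_post_gelu = hidden_dim). A minimal CPU sketch of what swiglu_forward is assumed to compute — the ordering of the two halves along the channel dimension is an assumption here; the actual kernel in llmc/swiglu.cuh may use the opposite convention:

#include <math.h>
#include <stddef.h>

// sketch: CPU reference for the assumed swiglu_forward semantics
// inp has (B*T, 2*hidden) channels laid out as [x1 | x2]; out has (B*T, hidden) channels
static void swiglu_forward_cpu(float* out, const float* inp, size_t BT, size_t hidden) {
    for (size_t i = 0; i < BT; i++) {
        const float* x1 = inp + i * 2 * hidden;          // gated branch (assumed to be the c_fc half)
        const float* x2 = x1 + hidden;                   // linear branch (assumed to be the c_fc2 half)
        float* o = out + i * hidden;
        for (size_t j = 0; j < hidden; j++) {
            float silu = x1[j] / (1.0f + expf(-x1[j]));  // SiLU(x) = x * sigmoid(x)
            o[j] = silu * x2[j];                         // SwiGLU halves the channel count
        }
    }
}

Whichever half is the gate, the output has half the channels of the input, which is why fch_gelu is sized with ffn_channels_post_gelu.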
@@ -621,6 +630,12 @@ void gpt2_forward(GPT2 *model, const int* inputs, size_t B, size_t T) {
     const size_t n_kv_head = model->config.num_kv_heads;
     const size_t hd = C / n_head; // head dimension
     const size_t qkv_channels = (n_head + 2*n_kv_head) * hd; // Q, K, V channels
+    size_t hidden_dim = 4 * C;
+    hidden_dim = (2 * hidden_dim) / 3;
+    hidden_dim = model->config.ffn_dim_multiplier * hidden_dim;
+    hidden_dim = model->config.multiple_of * ((hidden_dim + model->config.multiple_of - 1) / model->config.multiple_of);
+    size_t ffn_channels = hidden_dim * 2; // c_fc + c_fc2 concatenated
+    size_t ffn_channels_post_gelu = hidden_dim; // swiglu halves the channels

     // validate B,T are not larger than the values used at initialisation
     // (smaller B,T are okay for inference only)
@@ -653,9 +668,9 @@ void gpt2_forward(GPT2 *model, const int* inputs, size_t B, size_t T) {
         floatX* l_attprojw = params.attprojw + l * C * C;
         floatX* l_attprojb = params.attprojb + l * C;
         floatX* l_ln2w = params.ln2w + l * C;
-        floatX* l_fcw = params.fcw + l * 4*C * C;
-        floatX* l_fcb = params.fcb + l * 4*C;
-        floatX* l_fcprojw = params.fcprojw + l * C * 4*C;
+        floatX* l_fcw = params.fcw + l * ffn_channels * C;
+        floatX* l_fcb = params.fcb + l * ffn_channels;
+        floatX* l_fcprojw = params.fcprojw + l * C * ffn_channels_post_gelu;
         floatX* l_fcprojb = params.fcprojb + l * C;

         // get the pointers of the activations for this layer
@@ -665,10 +680,10 @@ void gpt2_forward(GPT2 *model, const int* inputs, size_t B, size_t T) {
         floatX* l_residual2 = acts.residual2 + l * B * T * C;
         floatX* l_ln2 = (model->recompute < 2) ? acts.ln2 + l * B * T * C : acts.lnf;
         float* l_ln2_rstd = acts.ln2_rstd + l * B * T;
-        floatX* l_fch = acts.fch + l * B * T * 4*C;
+        floatX* l_fch = acts.fch + l * B * T * ffn_channels;
         // reuse the same activation buffer at each layer, as we'll re-compute the gelu during backward
         // very useful because we dramatically reduce VRAM usage, and may be able to fit larger batch size
-        floatX* l_fch_gelu = (model->recompute < 1) ? acts.fch_gelu + l * B * T * 4*C : acts.fch_gelu;
+        floatX* l_fch_gelu = (model->recompute < 1) ? acts.fch_gelu + l * B * T * ffn_channels_post_gelu : acts.fch_gelu;
         floatX* l_residual3 = acts.residual3 + l * B * T * C;
         floatX* scratch = (floatX*)acts.output; // used for non-cudnn attention, fcproj, attproj, etc.
         floatX* qkv_rep_scratch = (floatX*)acts.scratch_bt4c; // we can use the BT4C scratch for qkv replication
@@ -687,20 +702,23 @@ void gpt2_forward(GPT2 *model, const int* inputs, size_t B, size_t T) {
         // 1) projection to QKV vectors (note k,v may be fewer heads than q)
         matmul_forward_cublaslt(scratch, l_ln1, l_qkvw, l_qkvb, B, T, C, qkv_channels, main_stream);
         // 2) replicate k,v so that all of q,k,v have the same number of heads. done for simplicity, for now
-        repkv_forward(qkv_rep_scratch, scratch, B, T, n_head, n_kv_head, hd);
+        repkv_forward(qkv_rep_scratch, scratch, B, T, n_head, n_kv_head, hd, main_stream);
         // 3) apply RoPE to q,k in place
-        rope_forward(qkv_rep_scratch, qkv_rep_scratch, model->freqs_cis, B, T, n_head, hd);
+        rope_forward(qkv_rep_scratch, qkv_rep_scratch, model->freqs_cis, B, T, n_head, hd, main_stream);
         // 4) attention: att <- softmax(qk^T)v
         attention_forward(l_atty, l_qkvr, l_att, qkv_rep_scratch, B, T, C, NH, main_stream);
         #endif

         matmul_forward_cublaslt(scratch, l_atty, l_attprojw, l_attprojb, B, T, C, C, main_stream);
         fused_residual_rmsnorm_forward5(l_residual2, l_ln2, l_ln2_rstd, residual, scratch, l_ln2w, B*T, C, main_stream);
+        matmul_forward_cublaslt(l_fch, l_ln2, l_fcw, l_fcb, B, T, C, ffn_channels, main_stream);
+        swiglu_forward(l_fch_gelu, l_fch, B, T, ffn_channels_post_gelu, main_stream);
+        matmul_forward_cublaslt(scratch, l_fch_gelu, l_fcprojw, l_fcprojb, B, T, ffn_channels_post_gelu, C, main_stream);

         // ------------------------------------------------------------------------
         // DEBUGGING: we only work until this point right now, so exit here
         // transfer the first 32 elements to CPU and print them
-        floatX* output = l_ln2;
+        floatX* output = scratch;
         floatX* cpu = (floatX*)mallocCheck(32 * sizeof(floatX));
         cudaCheck(cudaMemcpy(cpu, output, 32 * sizeof(floatX), cudaMemcpyDeviceToHost));
         for (int i = 0; i < 32; i++) {
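
For context on step 2 in the attention path of this hunk: repkv_forward expands the grouped-query K/V heads so that Q, K and V all have n_head heads before attention_forward runs. A CPU sketch of the assumed semantics — the packed [Q | K | V] channel layout and head-major ordering are assumptions, not taken from llmc/repkv.cuh itself:

#include <string.h>
#include <stddef.h>

// sketch: CPU reference for the assumed repkv_forward semantics (GQA -> MHA expansion)
// inp: (B, T, (n_head + 2*n_kv_head) * hd), packed as [Q | K | V] along channels
// out: (B, T, 3 * n_head * hd), with each K/V head repeated n_head/n_kv_head times
static void repkv_forward_cpu(float* out, const float* inp,
                              size_t B, size_t T,
                              size_t n_head, size_t n_kv_head, size_t hd) {
    size_t rep  = n_head / n_kv_head;            // replication factor (assumed to divide evenly)
    size_t Cin  = (n_head + 2 * n_kv_head) * hd;
    size_t Cout = 3 * n_head * hd;
    for (size_t bt = 0; bt < B * T; bt++) {
        const float* q = inp + bt * Cin;
        const float* k = q + n_head * hd;
        const float* v = k + n_kv_head * hd;
        float* o = out + bt * Cout;
        memcpy(o, q, n_head * hd * sizeof(float));  // Q is copied unchanged
        for (size_t h = 0; h < n_head; h++) {       // each K/V head serves `rep` query heads
            memcpy(o + (n_head + h) * hd,     k + (h / rep) * hd, hd * sizeof(float));
            memcpy(o + (2 * n_head + h) * hd, v + (h / rep) * hd, hd * sizeof(float));
        }
    }
}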
@@ -716,8 +734,6 @@ void gpt2_forward(GPT2 *model, const int* inputs, size_t B, size_t T) {
         exit(0);
         // ------------------------------------------------------------------------

-        matmul_forward_cublaslt(l_fch_gelu, l_ln2, l_fcw, l_fcb, B, T, C, 4*C, main_stream, l_fch, model->gelu_fusion);
-        matmul_forward_cublaslt(scratch, l_fch_gelu, l_fcprojw, l_fcprojb, B, T, 4*C, C, main_stream);
         // OK, fusion across blocks.
         if (l+1 != L) {
             floatX* l_ln1 = (model->recompute < 2) ? acts.ln1 + (l + 1) * B * T * C : acts.lnf;
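
And for step 3 in the attention path: rope_forward applies rotary position embeddings in place to the Q and K blocks of the replicated QKV tensor, using angles precomputed by precompute_freqs_cis. A CPU sketch under two assumptions — that freqs_cis stores interleaved (cos, sin) pairs per position and rotated dimension, and that each head rotates consecutive (even, odd) channel pairs; neither layout detail is confirmed by this diff:

#include <stddef.h>

// sketch: CPU reference for the assumed rope_forward semantics (rotates Q and K in place, skips V)
// qkv:       (B, T, 3 * n_head * hd), as produced by the repkv step
// freqs_cis: (T, hd/2, 2) holding interleaved (cos, sin) per rotated pair -- assumed layout
static void rope_forward_cpu(float* qkv, const float* freqs_cis,
                             size_t B, size_t T, size_t n_head, size_t hd) {
    for (size_t b = 0; b < B; b++) {
        for (size_t t = 0; t < T; t++) {
            for (size_t qk = 0; qk < 2; qk++) {          // 0 = Q block, 1 = K block; V (block 2) is skipped
                for (size_t h = 0; h < n_head; h++) {
                    float* x = qkv + (((b * T + t) * 3 + qk) * n_head + h) * hd;
                    for (size_t i = 0; i < hd / 2; i++) {
                        float c = freqs_cis[(t * (hd / 2) + i) * 2 + 0];
                        float s = freqs_cis[(t * (hd / 2) + i) * 2 + 1];
                        float x0 = x[2 * i], x1 = x[2 * i + 1];
                        x[2 * i]     = x0 * c - x1 * s;  // complex rotation of the (even, odd) pair
                        x[2 * i + 1] = x0 * s + x1 * c;
                    }
                }
            }
        }
    }
}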