Commit d36f0e6

Merge pull request #811 from ngc92/llama-fixes
Llama fixes
2 parents 49cef1d + 76a7cce commit d36f0e6

File tree

7 files changed (+55 −52 lines)

.github/workflows/ci_gpu.yml

Lines changed: 4 additions & 8 deletions

@@ -104,20 +104,14 @@ jobs:
           git clone https://github.com/NVIDIA/cudnn-frontend.git

       - name: Build with cuDNN
-        run: USE_CUDNN=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu
+        run: USE_CUDNN=1 make test_gpt2cu train_gpt2cu

       - name: Train model with cuDNN
         run: ./train_gpt2cu

-      - name: Train model fp32 with cuDNN
-        run: ./train_gpt2fp32cu
-
       - name: Execute testing program with cuDNN
         run: ./test_gpt2cu

-      - name: Execute testing program fp32 with cuDNN
-        run: ./test_gpt2fp32cu
-
   build-and-test-llama3:
     runs-on: ubicloud-gpu-standard-1-latest
     env:

@@ -137,7 +131,9 @@ jobs:
         run: python dev/data/tinyshakespeare.py --model_desc llama-3

       - name: Train model
-        run: python train_llama3.py --write_tensors 1 --dtype float32 --offload 1
+        # use the first 10 layers, so that everything fits into the 20GB of
+        # the A4000 Ada that we have in CI
+        run: python train_llama3.py --write_tensors 1 --dtype float32 --depth 10

       - name: Build FP32 precision
         run: PRECISION=FP32 make test_llama3cu

llmc/attention.cuh

Lines changed: 1 addition & 1 deletion

@@ -263,7 +263,7 @@ void attention_backward(floatX* dinp, floatX* dqkvr, floatX* datt, floatX* scrat
     matmul_cublaslt(dv, scratch, att, nullptr, HS, T, T, stream, false, true, B * NH, T * HS, T * T, T * HS);
     const float scale = 1.0f / sqrtf((float)HS);
     // backward into preatt. this is an in-place operation; datt turns into dpreatt here
-    softmax_autoregressive_backward_inplace_kernel<<<dim3(T / 4, B * NH), 256>>>(datt, att, B, T, C, scale);
+    softmax_autoregressive_backward_inplace_kernel<<<dim3(T / 4, B * NH), 256, 0, stream>>>(datt, att, B, T, C, scale);
     const floatX* dpreatt = datt;
     // backward into q
     matmul_cublaslt(dq, k, dpreatt, nullptr, HS, T, T, stream, false, false, B * NH, T * HS, T * T, T * HS);
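Editorial aside (not part of the commit): the third and fourth values in a CUDA launch configuration are the dynamic shared-memory size and the stream. The old line omitted them, so the softmax backward kernel went onto the default stream rather than the `stream` used by the surrounding cuBLASLt calls; depending on how the streams were created that either serializes unrelated work or, for non-blocking streams, leaves the kernel unordered with respect to them. Passing `0, stream` keeps the whole attention backward on one stream, and the repkv.cuh and train_llama3.cu changes below thread the stream through in the same way. A minimal, self-contained sketch of the pattern (names here are illustrative only):

// explicit_stream.cu — sketch of launching work on an explicit stream
#include <cuda_runtime.h>
#include <cstdio>

__global__ void scale_kernel(float* data, float scale, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) { data[i] *= scale; }
}

int main() {
    const int n = 1024;
    float h[n];
    for (int i = 0; i < n; i++) { h[i] = 1.0f; }

    float* d;
    cudaMalloc(&d, n * sizeof(float));
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // copy and kernel are enqueued on the same stream, so they run in order
    cudaMemcpyAsync(d, h, n * sizeof(float), cudaMemcpyHostToDevice, stream);
    // <<<grid, block, dynamicSharedMemBytes, stream>>>; omitting the last two
    // arguments (as the old llm.c line did) means 0 bytes and the default stream
    scale_kernel<<<(n + 255) / 256, 256, 0, stream>>>(d, 2.0f, n);
    cudaMemcpyAsync(h, d, n * sizeof(float), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    printf("h[0] = %f (expect 2.0)\n", h[0]);
    cudaStreamDestroy(stream);
    cudaFree(d);
    return 0;
}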

llmc/repkv.cuh

Lines changed: 3 additions & 3 deletions

@@ -50,7 +50,7 @@ __global__ void repkv_forward_kernel1(floatX* replicated_qkv,

 __global__ void repkv_backward_kernel1(floatX* dinp, const floatX* dout,
                                        int B, int N, int NH, int replicate_factor, int HD) {
-    // we have a single tensor dout of shapae of (B, N 3 * NH * HD)
+    // we have a single tensor dout of shape of (B, N 3 * NH * HD)
     // we want to reduce sum (for K and V) into (B, N, (NH + 2*(NH/replicate_factor)) * HD)
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= B * N * 3 * NH * HD) { return;}

@@ -111,11 +111,11 @@ void repkv_forward(floatX* out, const floatX* inp, int B, int T, int NH, int NH_
 }

 void repkv_backward(floatX* dinp, const floatX* dout,
-                    const int B, const int T, const int NH, const int NH_KV, const int d) {
+                    const int B, const int T, const int NH, const int NH_KV, const int d, cudaStream_t stream) {
     const int block_size = 128;
     int total_threads = B * T * (3 * NH) * d;
     int num_blocks = CEIL_DIV(total_threads, block_size);
     int replicate_factor = NH / NH_KV;
-    repkv_backward_kernel1<<<num_blocks, block_size>>>(dinp, dout, B, T, NH, replicate_factor, d);
+    repkv_backward_kernel1<<<num_blocks, block_size, 0, stream>>>(dinp, dout, B, T, NH, replicate_factor, d);
     cudaCheck(cudaGetLastError());
 }

requirements.txt

Lines changed: 1 addition & 2 deletions

@@ -4,5 +4,4 @@ torch
 tiktoken
 transformers
 datasets
-requests
-torchao
+requests

test_llama3.cu

Lines changed: 7 additions & 16 deletions

@@ -128,17 +128,20 @@ int main(int argc, char *argv[]) {
     int state_header[256];
     freadCheck(state_header, sizeof(int), 256, state_file);
     if (state_header[0] != 20240803) { fprintf(stderr, "Bad magic state file\n"); exit(EXIT_FAILURE); }
-    if (state_header[1] != 2) {
+    if (state_header[1] != 3) {
         fprintf(stderr, "Bad version in state file: %d\n", state_header[1]);
         fprintf(stderr, "---> HINT: try to re-run `python train_llama3.py`\n");
         exit(EXIT_FAILURE);
     }
     int B = state_header[2]; // batch size, e.g. 4
     int T = state_header[3]; // time / sequence length (e.g. 64, up to maxT)
+    int steps = state_header[4];
+    float* expected_losses = (float*) malloc(steps * sizeof(float));
     assert(0 <= T && T <= maxT);
     printf("[State]\n");
     printf("batch_size: %d\n", B);
     printf("seq_len: %d\n", T);
+    printf("steps: %d\n", steps);

     set_zero_configs(&multi_gpu_config, 0, model.num_parameters);

@@ -157,6 +160,7 @@ int main(int argc, char *argv[]) {
     FloatParameterTensors expected_grads; // will be read from file. right now: all in fp32
     float* expected_grads_memory = float_cpu_malloc_and_point_parameters(&expected_grads, model.param_elements);
     freadCheck(expected_grads_memory, sizeof(float), model.num_parameters, state_file);
+    freadCheck(expected_losses, sizeof(float), steps, state_file);
     fcloseCheck(state_file);

     // this memory will be used to do one single copy of all (mixed precision) GPU grads to CPU grads

@@ -290,27 +294,13 @@ int main(int argc, char *argv[]) {
         llama3_update(&model, 1e-5f, 0.9f, 0.95f, 1e-8f, 0.0f, grad_scale, step+1, &multi_gpu_config);

         // print the timing information at the end
-        printf("step %d: loss %f (took %f ms)\n", step+1, model.mean_loss, time_elapsed_s * 1000);
+        printf("step %d: loss %f norm %f (took %f ms)\n", step+1, model.mean_loss, grad_norm, time_elapsed_s * 1000);
         // the expected losses from PyTorch were copied over after the print formatting rounded
         // them to 6 decimal places, so we do the same here
         float rounded_loss = roundf(model.mean_loss * 1000000) / 1000000;
         losses[step] = rounded_loss;
     }

-    // expected losses are as follows, from Python (without CPUOffload)
-    float expected_losses[10] = {
-        4.849688f,
-        3.070303f,
-        1.711614f,
-        1.056311f,
-        0.593335f,
-        0.428291f,
-        0.372275f,
-        0.360507f,
-        0.355562f,
-        0.334824f
-    };
-
     // compare
     for (int i = 0; i < 10; i++) {
         if (fabsf(losses[i] - expected_losses[i]) >= loss_diff_threshold) {

@@ -377,6 +367,7 @@ int main(int argc, char *argv[]) {
     common_free(model);
     free(x);
     free(y);
+    free(expected_losses);
     free(logits_cpu_raw);
     free(logits_cpu);
     free(expected_logits);
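Editorial aside (not part of the commit): the version bump to 3 marks a new debug-state layout in which the reference losses come from the file instead of a hard-coded array. Piecing together this diff and the train_llama3.py change below, the header fields the test now consumes can be sketched roughly as follows; the struct and its field names are descriptive, not identifiers from the repo:

/* Sketch of the version-3 debug-state header as read by test_llama3.cu. */
typedef struct {
    int magic;        /* header[0] == 20240803 */
    int version;      /* header[1] == 3 (the test now rejects version 2) */
    int B;            /* header[2], batch size */
    int T;            /* header[3], sequence length */
    int steps;        /* header[4], number of recorded training steps (new) */
    int unused[251];  /* remainder of the 256-int header */
} DebugStateHeader;

/* After the header comes the payload that write_state already produced,
 * ending in the float32 reference grads the test reads, followed by:
 *   float32 expected_losses[steps];  // read here, replacing the old array
 *   float32 norms[steps];            // also appended, not read by this test */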

train_llama3.cu

Lines changed: 1 addition & 1 deletion

@@ -923,7 +923,7 @@ void llama3_backward_and_reduce(LLama3 *model, int* inputs, const int* targets,
         // backward rope (this can be done in-place)
         rope_backward_inplace(dl_bt4c, dl_bt4c, model->freqs_cis, B, T, NH, hd, main_stream);
         // backward repkv (use scratchX as gradient buffer here)
-        repkv_backward(dl_bt4c2, dl_bt4c, B, T, NH, n_kv_head, hd);
+        repkv_backward(dl_bt4c2, dl_bt4c, B, T, NH, n_kv_head, hd, main_stream);
         // backward QKV projection
         if(model->recompute >= 2) {
             rmsnorm_forward(l_ln1, l_ln1_rstd, residual, l_ln1w, B, T, C, main_stream);

train_llama3.py

Lines changed: 38 additions & 21 deletions

@@ -472,7 +472,7 @@ def from_pretrained_llama3_meta(cls, ckpt_dir, tokenizer_path):
         model.tokenizer = tokenizer
         return model

-    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type, zero_stage, offload):
+    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type, zero_stage):
         # start with all of the candidate parameters
         param_dict = {pn: p for pn, p in self.named_parameters()}
         # filter out those that do not require grad

@@ -494,14 +494,10 @@ def configure_optimizers(self, weight_decay, learning_rate, betas, device_type,
         use_fused = fused_available and device_type == 'cuda'
         print0(f"using fused AdamW: {use_fused}")
         if zero_stage == 1:
-            assert not offload
             print0("using ZeroRedundancyOptimizer")
             optimizer = ZeroRedundancyOptimizer(**optim_groups[0], optimizer_class=torch.optim.AdamW,
                                                 lr=learning_rate, betas=betas, fused=use_fused)
             optimizer.add_param_group(optim_groups[1])
-        elif offload:
-            from torchao.prototype.low_bit_optim import CPUOffloadOptimizer
-            optimizer = CPUOffloadOptimizer(optim_groups, torch.optim.AdamW, lr=learning_rate, betas=betas, fused=use_fused)
         else:
             print0("using regular AdamW")
             optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, fused=use_fused)

@@ -980,9 +976,10 @@ def write_state(model, x, y, logits, loss, filename):
     # this can be used for checking the computation correctness in C
     header = torch.zeros(256, dtype=torch.int32)
     header[0] = 20240803 # magic
-    header[1] = 2 # version
+    header[1] = 3 # version
     header[2] = x.size(0) # batch size of the batch, B
     header[3] = x.size(1) # temporal extent of the batch, T
+    header[4] = 0
     grads = {name: param.grad.cpu() for name, param in model.named_parameters()}
     with open(filename, "wb") as file:
         # header

@@ -999,6 +996,22 @@ def write_state(model, x, y, logits, loss, filename):
         write_tensors(grads, model.config.n_layer, file, "float32")
     print(f"wrote {filename}")

+
+def write_training_history(losses, norms, filename):
+    # amends the state file with the sequence of losses and grad norms
+    assert len(norms) == len(losses)
+    with open(filename, "r+b") as f:
+        header = np.frombuffer(f.read(256*4), dtype=np.int32).copy()
+        header[4] = len(losses)
+        f.seek(0, os.SEEK_SET)
+        f.write(header.tobytes())
+        f.seek(0, os.SEEK_END)
+        # write the losses and norms at the end of the file
+        f.write(np.asarray(losses).astype(np.float32).tobytes())
+        f.write(np.asarray(norms).astype(np.float32).tobytes())
+
+    print(f"updated {filename}")
+
 # -----------------------------------------------------------------------------
 # int main


@@ -1022,6 +1035,7 @@ def print0(*args, **kwargs):
     parser.add_argument("--input_val_bin", type=str, default="", help="input .bin to eval validation loss on")
     parser.add_argument("--output_dir", type=str, default="", help="output directory to which to write logs and checkpoints")
     parser.add_argument("--model", type=str, default="meta-llama/Llama-3.2-1B", help="chose the llama model")
+    parser.add_argument("--depth", type=int, default=-1, help="load only a subset of the model's layers")
     # token layout for each step of the optimization
     parser.add_argument("--batch_size", type=int, default=4, help="batch size, in units of #batch dimensions")
     parser.add_argument("--sequence_length", type=int, default=64, help="sequence length")

@@ -1048,7 +1062,6 @@ def print0(*args, **kwargs):
     parser.add_argument("--compile", type=int, default=0, help="torch.compile the model")
     parser.add_argument("--dtype", type=str, default="bfloat16", help="float32|float16|bfloat16")
     parser.add_argument("--zero_stage", type=int, default=0, help="zero redundancy optimizer stage (0/1/2/3)")
-    parser.add_argument("--offload", type=int, default=0, help="offload optimizer to CPU")
     # python -> C bridge
     parser.add_argument("--write_tensors", type=int, default=0, help="write tensors to disk")
     args = parser.parse_args()

@@ -1133,9 +1146,16 @@ def print0(*args, **kwargs):
         assert args.tokenizer_path is not None and os.path.exists(args.tokenizer_path), f"llama3 tokenizer path {args.tokenizer_path} does not exist"
         model = LLaMA.from_pretrained_llama3_meta(args.ckpt_dir, args.tokenizer_path)

-    # convert the model to the desired precision
-    if args.dtype == "float32":
-        model = model.to(torch.float32)
+    if args.depth > 0:
+        assert args.depth < len(model.transformer.h), f"invalid depth {args.depth}, model has {len(model.transformer.h)} blocks"
+        model.transformer.h = model.transformer.h[0:args.depth]
+        model.config.n_layer = args.depth
+
+    # PT optimizer doesn't do stochastic rounding, so we
+    # really want the model to be in fp32 precision:
+    # --dtype should only enable AMP
+    # as the original checkpoints are in 16 bit, we need to convert
+    model = model.to(torch.float32)

     model = model.to(device)
     model.train()

@@ -1185,7 +1205,7 @@ def print0(*args, **kwargs):
     # init the optimizer
     optimizer = raw_model.configure_optimizers(weight_decay=args.weight_decay,
                                                learning_rate=args.learning_rate, betas=(0.9, 0.95),
-                                               device_type=device, zero_stage=zero_stage, offload=args.offload)
+                                               device_type=device, zero_stage=zero_stage)

     # learning rate decay scheduler (cosine with warmup)
     def get_lr(it):

@@ -1205,6 +1225,8 @@ def get_lr(it):
     if device == "cuda":
         torch.cuda.reset_peak_memory_stats()
     timings = []
+    losses = []
+    norms = []
     norm = -1.0 # dummy value to print in inference-only mode
     for step in range(args.num_iterations + 1):
         t0 = time.time()

@@ -1298,16 +1320,6 @@ def get_lr(it):
             dist.all_reduce(lossf, op=dist.ReduceOp.AVG)
             lossf = lossf.item()
         norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
-        if args.offload:
-            # CPUOffloadOptimizer is *not* compatible with gradient clipping and will *silently*
-            # give wrong results. So we
-            # a) explicitly wait for it to finish its gradients transfers
-            # b) overwrite the CPU gradients with the clipped GPU gradients.
-            # This is terribly inefficient, but correct and lets us run CI on
-            # small(ish) GPUs
-            torch.cuda.synchronize()
-            for gpu, cpu in optimizer.param_d2h_map.items():
-                cpu.grad[...] = gpu.grad[...]

         # determine and set the learning rate for this iteration
         lr = get_lr(step)

@@ -1327,6 +1339,8 @@ def get_lr(it):
         t1 = time.time()
         # the 0th iteration is often an outlier (much slower) => skip logging it
         tokens_per_second = grad_accum_steps * ddp_world_size * B * T / (t1-t0)
+        losses.append(lossf)
+        norms.append(norm.item())
         print0(f"step {step+1:4d}/{args.num_iterations} | train loss {lossf:.6f} | norm {norm:.4f} | lr {lr:.2e} | ({(t1-t0)*1000:.2f} ms | {tokens_per_second:.0f} tok/s)")
         # log to logile
         if master_process and logfile is not None:

@@ -1337,6 +1351,9 @@ def get_lr(it):
         if step > 0 and step > args.num_iterations - 20:
             timings.append(t1-t0)

+    if master_process and args.write_tensors and (not args.inference_only):
+        write_training_history(losses, norms, f"llama3_{model_size_str}_debug_state.bin")
+
     # print the average of the last 20 timings, to get something smooth-ish
     timings = timings[-20:]
     print0(f"final {len(timings)} iters avg: {np.mean(timings)*1000:.3f}ms")
