Commit 7403cff

Switch from torchrun to elastic library

1 parent b1ca472 commit 7403cff

3 files changed: +125 -34

README.md

Lines changed: 3 additions & 3 deletions
````diff
@@ -113,16 +113,16 @@ We currently don't support fine-grained manual control over the learning rate, n
 
 ## Distributed training
 
-We support distributed training via PyTorch's `torchrun` command. By default we use the Distributed Data Parallel method, which means that the weights of each SAE are replicated on every GPU.
+We support distributed training out of the box using Torch Distributed Elastic. By default we use Distributed Data Parallel with all visible GPUs, which means that the weights of each SAE are replicated on every GPU.
 
 ```bash
-torchrun --nproc_per_node gpu -m sparsify meta-llama/Meta-Llama-3-8B --batch_size 1 --layers 16 24 --k 192 --grad_acc_steps 8 --ctx_len 2048
+python -m sparsify meta-llama/Meta-Llama-3-8B --batch_size 1 --layers 16 24 --k 192 --grad_acc_steps 8 --ctx_len 2048
 ```
 
 This is simple, but very memory inefficient. If you want to train SAEs for many layers of a model, we recommend using the `--distribute_modules` flag, which allocates the SAEs for different layers to different GPUs. Currently, we require that the number of GPUs evenly divides the number of layers you're training SAEs for.
 
 ```bash
-torchrun --nproc_per_node gpu -m sparsify meta-llama/Meta-Llama-3-8B --distribute_modules --batch_size 1 --layer_stride 2 --grad_acc_steps 8 --ctx_len 2048 --k 192 --load_in_8bit --micro_acc_steps 2
+python -m sparsify meta-llama/Meta-Llama-3-8B --distribute_modules --batch_size 1 --layer_stride 2 --grad_acc_steps 8 --ctx_len 2048 --k 192 --load_in_8bit --micro_acc_steps 2
 ```
 
 The above command trains an SAE for every _even_ layer of Llama 3 8B, using all available GPUs. It accumulates gradients over 8 minibatches, and splits each minibatch into 2 microbatches before feeding them into the SAE encoder, thus saving a lot of memory. It also loads the model in 8-bit precision using `bitsandbytes`. This command requires no more than 48GB of memory per GPU on an 8 GPU node.
````
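With the elastic launcher, `python -m sparsify` itself spawns one process per visible CUDA device, so there is no `--nproc_per_node` flag to pass. A minimal illustrative sketch (not part of the commit), assuming only standard CUDA and PyTorch behaviour: the world size comes from `torch.cuda.device_count()`, which honours `CUDA_VISIBLE_DEVICES`, so restricting visibility restricts the run.

```python
# Illustrative only (not part of the commit): the launcher in
# sparsify/distributed.py sizes the job from torch.cuda.device_count(),
# which respects CUDA_VISIBLE_DEVICES. Launching with, e.g.,
#   CUDA_VISIBLE_DEVICES=0,1 python -m sparsify ...
# therefore trains on just those two GPUs.
import torch

if __name__ == "__main__":
    print("world size the launcher would use:", torch.cuda.device_count())
```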

sparsify/__main__.py

Lines changed: 44 additions & 31 deletions
```diff
@@ -18,6 +18,7 @@
 )
 
 from .data import MemmapDataset, chunk_and_tokenize
+from .distributed import handle_distribute
 from .trainer import TrainConfig, Trainer
 from .utils import simple_parse_args_string
 
@@ -77,31 +78,7 @@ class RunConfig(TrainConfig):
     format 'arg1=val1,arg2=val2'."""
 
 
-def load_artifacts(
-    args: RunConfig, rank: int
-) -> tuple[PreTrainedModel, Dataset | MemmapDataset]:
-    if args.load_in_8bit:
-        dtype = torch.float16
-    elif torch.cuda.is_bf16_supported():
-        dtype = torch.bfloat16
-    else:
-        dtype = "auto"
-
-    # End-to-end training requires a model with a causal LM head
-    model_cls = AutoModel if args.loss_fn == "fvu" else AutoModelForCausalLM
-    model = model_cls.from_pretrained(
-        args.model,
-        device_map={"": f"cuda:{rank}"},
-        quantization_config=(
-            BitsAndBytesConfig(load_in_8bit=args.load_in_8bit)
-            if args.load_in_8bit
-            else None
-        ),
-        revision=args.revision,
-        torch_dtype=dtype,
-        token=args.hf_token,
-    )
-
+def load_data(args: RunConfig):
     # For memmap-style datasets
     if args.dataset.endswith(".bin"):
         dataset = MemmapDataset(args.dataset, args.ctx_len, args.max_examples)
@@ -137,10 +114,36 @@ def load_artifacts(
     if limit := args.max_examples:
         dataset = dataset.select(range(limit))
 
-    return model, dataset
+    return dataset
 
 
-def run():
+def load_model_artifact(args: RunConfig, rank: int) -> PreTrainedModel:
+    if args.load_in_8bit:
+        dtype = torch.float16
+    elif torch.cuda.is_bf16_supported():
+        dtype = torch.bfloat16
+    else:
+        dtype = "auto"
+
+    # End-to-end training requires a model with a causal LM head
+    model_cls = AutoModel if args.loss_fn == "fvu" else AutoModelForCausalLM
+    model = model_cls.from_pretrained(
+        args.model,
+        device_map={"": f"cuda:{rank}"},
+        quantization_config=(
+            BitsAndBytesConfig(load_in_8bit=args.load_in_8bit)
+            if args.load_in_8bit
+            else None
+        ),
+        revision=args.revision,
+        torch_dtype=dtype,
+        token=args.hf_token,
+    )
+
+    return model
+
+
+def worker(args: RunConfig, dataset: Dataset | MemmapDataset):
     local_rank = os.environ.get("LOCAL_RANK")
     ddp = local_rank is not None
     rank = int(local_rank) if ddp else 0
@@ -157,17 +160,15 @@ def run():
         if rank == 0:
             print(f"Using DDP across {dist.get_world_size()} GPUs.")
 
-    args = parse(RunConfig)
-
     # Prevent ranks other than 0 from printing
     with nullcontext() if rank == 0 else redirect_stdout(None):
         # Awkward hack to prevent other ranks from duplicating data preprocessing
         if not ddp or rank == 0:
-            model, dataset = load_artifacts(args, rank)
+            model = load_model_artifact(args, rank)
         if ddp:
            dist.barrier()
            if rank != 0:
-                model, dataset = load_artifacts(args, rank)
+                model = load_model_artifact(args, rank)
 
         # Drop examples that are indivisible across processes to prevent deadlock
         remainder_examples = len(dataset) % dist.get_world_size()
@@ -196,5 +197,17 @@
         trainer.fit()
 
 
+def run():
+    args = parse(RunConfig)
+
+    dataset = load_data(args)
+
+    handle_distribute(
+        process_name="sparsify",
+        worker=worker,
+        const_worker_args=[args, dataset],
+    )
+
+
 if __name__ == "__main__":
     run()
```
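The refactored entry point loads the dataset once in the launching process and hands it, together with the parsed config, to every worker; each worker then recovers its rank from the `LOCAL_RANK` variable that the new launcher injects. A minimal sketch of that bookkeeping, mirroring the lines in the hunks above; the helper name here is hypothetical:

```python
# Minimal sketch (not the commit's code): how a worker process decides whether
# it is running under the elastic launcher. handle_distribute sets LOCAL_RANK
# for every spawned process; a plain single-GPU run leaves it unset.
import os


def detect_rank() -> tuple[bool, int]:
    """Hypothetical helper mirroring the bookkeeping at the top of worker()."""
    local_rank = os.environ.get("LOCAL_RANK")
    ddp = local_rank is not None
    rank = int(local_rank) if ddp else 0
    return ddp, rank


if __name__ == "__main__":
    print(detect_rank())  # (False, 0) when run outside the launcher
```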

sparsify/distributed.py

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@

```python
import socket
from typing import Any, Callable

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.distributed.elastic.multiprocessing import DefaultLogsSpecs, start_processes


def dist_worker(
    worker: Callable,
    *worker_args,
):
    try:
        worker(*worker_args)
    finally:
        if dist.is_initialized():
            try:
                dist.barrier()
            except Exception as e:
                print(f"Barrier failed during cleanup: {e}")
                pass

            dist.destroy_process_group()


def handle_distribute(process_name: str, worker, const_worker_args: list[Any]):
    """
    Launch a distributed multi-process job over all visible CUDA devices.

    Parameters
    ----------
    process_name : str
        Label used by Torch Elastic to tag logs and processes.
    worker : Callable
        Function that will be executed on every spawned process. It must accept
        ``(rank, world_size, *const_worker_args)`` in that order.
    const_worker_args : list
        Arguments passed verbatim to every worker invocation after ``rank`` and
        ``world_size``. These are typically configuration or shared datasets.
    """
    world_size = torch.cuda.device_count()
    if world_size <= 1:
        # Run the worker directly if no distributed training is needed. This is great
        # for debugging purposes.
        worker(0, 1, *const_worker_args)
    else:
        # Set up multiprocessing and distributed training
        mp.set_sharing_strategy("file_system")

        # Find an available port for distributed training
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(("", 0))
            _, port = s.getsockname()

        ctx = None
        try:
            ctx = start_processes(
                process_name,
                dist_worker,
                args={
                    i: (worker, i, world_size, *const_worker_args)
                    for i in range(world_size)
                },
                envs={
                    i: {
                        "LOCAL_RANK": str(i),
                        "MASTER_ADDR": "localhost",
                        "MASTER_PORT": str(port),
                    }
                    for i in range(world_size)
                },
                logs_specs=DefaultLogsSpecs(),
            )
            ctx.wait()
        finally:
            if ctx is not None:
                ctx.close()  # Kill any processes that are still running
```
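For reference, a self-contained usage sketch of the new helper; the toy worker and its message argument are hypothetical, but the call shape follows the docstring above: the worker receives `(rank, world_size, *const_worker_args)` whether it is called directly (single GPU) or spawned once per device by Torch Elastic.

```python
# Usage sketch with a hypothetical worker (not part of the commit).
# With <= 1 visible GPU the worker is called in-process as worker(0, 1, ...);
# otherwise one process per device is started and LOCAL_RANK, MASTER_ADDR and
# MASTER_PORT are injected into each process's environment.
from sparsify.distributed import handle_distribute


def toy_worker(rank: int, world_size: int, message: str):
    # const_worker_args are appended after rank and world_size.
    print(f"[rank {rank}/{world_size}] {message}")


if __name__ == "__main__":
    handle_distribute(
        process_name="toy",
        worker=toy_worker,
        const_worker_args=["hello from every process"],
    )
```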
