
Commit 2beb064

job packing
1 parent 9746e89 commit 2beb064

1 file changed

slurm/process_array.sbatch

Lines changed: 84 additions & 65 deletions
@@ -1,92 +1,111 @@
 #!/bin/bash --login
 
-#SBATCH --job-name=process_swegnn
-#SBATCH --nodes=1
-#SBATCH --tasks-per-node=128
-#SBATCH --cpus-per-task=1
-#SBATCH --time=04:00:00
+#SBATCH --job-name=pack_swegnn
 #SBATCH --account=n02-bas
 #SBATCH --partition=standard
 #SBATCH --qos=standard
 
-# --- JOB ARRAY ---
-# Replace '499' with your (total_runs - 1)
-# %128 means "run 128 tasks at a time"
-# SBATCH --array=0-499%128
+# --- STEP 1: THE "MANAGER" ARRAY ---
+# We need to process ~500 runs.
+# Each manager job will process 128 runs at a time.
+# So, we need ceil(500 / 128) = 4 array tasks.
+# Let's use 0-3 (which is 4 tasks).
+#
+#
+#SBATCH --array=0-3
+
+# --- STEP 2: RESOURCE REQUEST (PER MANAGER) ---
+# Each *manager task* (0, 1, 2, or 3) gets ONE full node.
+# We will then use all 128 cores on that node.
+#SBATCH --nodes=1
+#SBATCH --tasks-per-node=128
+#SBATCH --time=01:00:00 # 1 hour to process 128 runs (adjust if needed)
 
-# Output log file:
-# %x = job name, %A = job ID, %a = array task ID
+# Log file for each manager task
 #SBATCH --output=logs/%x-%A_%a.out
 
-##SBATCH --mail-type=BEGIN,END,FAIL
-
-
 # --- Setup ---
 source /work/n02/n02/sdat2/.bashrc
-which micromamba
 micromamba activate t1
 
 # Matplotlib cache
-mkdir $(pwd)'/matplotlib'
+mkdir -p $(pwd)'/matplotlib'
 export MPLCONFIGDIR=$(pwd)'/matplotlib'
 
-echo "Which python:"
-which python
+echo "---"
+echo "Job Pack Manager Task: $SLURM_ARRAY_TASK_ID"
+echo "Node: $SLURM_NODELIST"
+echo "Cores: $SLURM_NTASKS"
 echo "---"
 
 # --- Define Paths ---
 RUN_PARENT_DIR="/work/n02/n02/sdat2/adcirc-swan/worstsurge/run_5sec"
 SAVE_PARENT_DIR="/work/n02/n02/sdat2/adcirc-swan/worstsurge/swegnn_5sec"
-
-# Create the main output directory
 mkdir -p $SAVE_PARENT_DIR
 
-# --- Job Array Logic ---
-# 1. Create a bash array of all run directories
+# --- STEP 3: JOB PACKING LOGIC ---
+
+# 1. Get the *full* list of all directories
 DIRS=($(find $RUN_PARENT_DIR -mindepth 1 -maxdepth 1 -type d | sort))
+NUM_DIRS=${#DIRS[@]}
+echo "Found $NUM_DIRS total directories to process."
+
+# 2. Define how many runs this manager task will process
+TASKS_PER_NODE=128 # This MUST match --tasks-per-node above
 
-# 2. Check if the array is empty
-if [ ${#DIRS[@]} -eq 0 ]; then
-    echo "Error: No run directories found in $RUN_PARENT_DIR"
-    exit 1
-fi
-
-# 3. Get the specific directory for THIS Slurm array task
-MY_RUN_DIR=${DIRS[$SLURM_ARRAY_TASK_ID]}
-RUN_NAME=$(basename $MY_RUN_DIR)
-
-# 4. Check if the directory was found
-if [ -z "$MY_RUN_DIR" ]; then
-    echo "Error: Could not find directory for task ID $SLURM_ARRAY_TASK_ID."
-    echo "Total directories found: ${#DIRS[@]}"
-    exit 1
-fi
-
-# 5. Define the full path for the output file
-MY_SAVE_PATH="$SAVE_PARENT_DIR/${RUN_NAME}.nc"
-
-# --- Run the Job ---
-echo "Slurm Job Array Task: $SLURM_ARRAY_TASK_ID"
-echo "Processing Run Dir: $MY_RUN_DIR"
-echo "Saving Output To: $MY_SAVE_PATH"
+# 3. Calculate the *chunk* of directories this manager task is responsible for
+#    Manager 0 (ID=0) processes dirs 0-127
+#    Manager 1 (ID=1) processes dirs 128-255
+#    ...and so on.
+START_INDEX=$(( $SLURM_ARRAY_TASK_ID * $TASKS_PER_NODE ))
+
+echo "This manager (Task ID $SLURM_ARRAY_TASK_ID) will process $TASKS_PER_NODE runs,"
+echo "starting from directory index $START_INDEX."
 echo "---"
 
-# --- CHOOSE YOUR MODE ---
-
-# OPTION 1: Run with use_dask=False (My recommendation for Lustre)
-# The --use-dask flag is absent, so the script defaults to False.
-echo "Running with use_dask=False"
-python -m adforce.process_single \
-    --run-path $MY_RUN_DIR \
-    --save-path $MY_SAVE_PATH
-
-# OPTION 2: Run with use_dask=True (To test your hypothesis)
-# Comment out the block above and uncomment this block.
-# The --use-dask flag is present, so the script sets it to True.
-# echo "Running with use_dask=True"
-# python -m adforce.process_single_run \
-#     --run-path $MY_RUN_DIR \
-#     --save-path $MY_SAVE_PATH \
-#     --use-dask
-
-echo "Task $SLURM_ARRAY_TASK_ID complete."
+# 4. Loop from 0 to 127
+for i in $(seq 0 $(( $TASKS_PER_NODE - 1 ))); do
+
+    # Get the *actual* directory index
+    DIR_INDEX=$(( $START_INDEX + $i ))
+
+    # Check if this directory exists (handles the last partial chunk)
+    if [ $DIR_INDEX -ge $NUM_DIRS ]; then
+        echo "Index $DIR_INDEX is out of bounds (>= $NUM_DIRS). No more dirs."
+        continue
+    fi
+
+    # Get the directory name and output file name
+    MY_RUN_DIR=${DIRS[$DIR_INDEX]}
+    RUN_NAME=$(basename $MY_RUN_DIR)
+    MY_SAVE_PATH="$SAVE_PARENT_DIR/${RUN_NAME}.nc"
+
+    echo "Launching task for: $RUN_NAME"
+
+    # --- STEP 4: LAUNCH THE PROCESS ---
+    # This is the key:
+    # 'srun'               : The Slurm command to run a task.
+    # '--nodes=1'          : Use 1 node (from our allocation).
+    # '--ntasks=1'         : This process is 1 task.
+    # '--cpus-per-task=1'  : This process gets 1 CPU.
+    # '--exclusive'        : Give this step its own dedicated CPU (don't share with other steps).
+    # '--output=/dev/null' : Discard this step's stdout (keeps the main log readable).
+    # '&'                  : Run this in the BACKGROUND.
+
+    srun --nodes=1 --ntasks=1 --cpus-per-task=1 --exclusive --output=/dev/null \
+        python -m adforce.process_single \
+        --run-path $MY_RUN_DIR \
+        --save-path $MY_SAVE_PATH &
+
+    # Add --use-dask here if you want to test it
+
+done
+
+# --- STEP 5: WAIT ---
+# The script will get here in ~1 second after launching 128 jobs.
+# 'wait' tells the script to pause here until all background (&)
+# processes have finished.
echo "---"
+echo "All $TASKS_PER_NODE tasks launched. Waiting for them to complete..."
+wait
+echo "All tasks complete. Manager job finished."

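The bare wait at STEP 5 blocks until every backgrounded srun step has returned, but it does not report which, if any, failed. If per-run failures matter, one option is to record each background PID and wait on them individually. A minimal sketch of that pattern in plain bash (the subshell with sleep is a stand-in for the real srun ... python -m adforce.process_single call, and task 1 is made to fail on purpose; pids and failed are illustrative names):

#!/bin/bash
# Sketch: wait on each background task individually and count failures.
pids=()
for i in 0 1 2; do
    ( sleep 1; [ "$i" -ne 1 ] ) &   # stand-in for the srun launch; exits non-zero for i=1
    pids+=($!)
done

failed=0
for pid in "${pids[@]}"; do
    wait "$pid" || failed=$(( failed + 1 ))
done
echo "All tasks finished; $failed task(s) exited non-zero."

The same bookkeeping drops into the script's own loop by appending pids+=($!) after the srun ... & line and replacing the single wait with the second loop.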
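For day-to-day use, the directives above assume the logs/ directory already exists (Slurm does not create directories named in --output). A typical submit-and-monitor sequence using only standard Slurm commands (the <jobid> placeholder is whatever sbatch prints):

mkdir -p logs
sbatch slurm/process_array.sbatch
squeue -u "$USER"                               # one row per manager array task
sacct -j <jobid> --format=JobID,State,Elapsed
tail -f logs/pack_swegnn-<jobid>_0.out          # follow the first manager's log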