
Commit 2beb064

job packing
1 parent 9746e89 commit 2beb064

1 file changed

slurm/process_array.sbatch

Lines changed: 84 additions & 65 deletions
@@ -1,92 +1,111 @@
 #!/bin/bash --login
 
-#SBATCH --job-name=process_swegnn
-#SBATCH --nodes=1
-#SBATCH --tasks-per-node=128
-#SBATCH --cpus-per-task=1
-#SBATCH --time=04:00:00
+#SBATCH --job-name=pack_swegnn
 #SBATCH --account=n02-bas
 #SBATCH --partition=standard
 #SBATCH --qos=standard
 
-# --- JOB ARRAY ---
-# Replace '499' with your (total_runs - 1)
-# %128 means "run 128 tasks at a time"
-# SBATCH --array=0-499%128
+# --- STEP 1: THE "MANAGER" ARRAY ---
+# We need to process ~500 runs.
+# Each manager job will process 128 runs at a time.
+# So, we need ceil(500 / 128) = 4 array tasks.
+# Let's use 0-3 (which is 4 tasks).
+#
+#
+#SBATCH --array=0-3
+
+# --- STEP 2: RESOURCE REQUEST (PER MANAGER) ---
+# Each *manager task* (0, 1, 2, or 3) gets ONE full node.
+# We will then use all 128 cores on that node.
+#SBATCH --nodes=1
+#SBATCH --tasks-per-node=128
+#SBATCH --time=01:00:00 # 1 hour to process 128 runs (adjust if needed)
 
-# Output log file:
-# %x = job name, %A = job ID, %a = array task ID
+# Log file for each manager task
 #SBATCH --output=logs/%x-%A_%a.out
 
-##SBATCH --mail-type=BEGIN,END,FAIL
-
-
 # --- Setup ---
 source /work/n02/n02/sdat2/.bashrc
-which micromamba
 micromamba activate t1
 
 # Matplotlib cache
-mkdir $(pwd)'/matplotlib'
+mkdir -p $(pwd)'/matplotlib'
 export MPLCONFIGDIR=$(pwd)'/matplotlib'
 
-echo "Which python:"
-which python
+echo "---"
+echo "Job Pack Manager Task: $SLURM_ARRAY_TASK_ID"
+echo "Node: $SLURM_NODELIST"
+echo "Cores: $SLURM_NTASKS"
 echo "---"
 
 # --- Define Paths ---
 RUN_PARENT_DIR="/work/n02/n02/sdat2/adcirc-swan/worstsurge/run_5sec"
 SAVE_PARENT_DIR="/work/n02/n02/sdat2/adcirc-swan/worstsurge/swegnn_5sec"
-
-# Create the main output directory
 mkdir -p $SAVE_PARENT_DIR
 
-# --- Job Array Logic ---
-# 1. Create a bash array of all run directories
+# --- STEP 3: JOB PACKING LOGIC ---
+
+# 1. Get the *full* list of all directories
 DIRS=($(find $RUN_PARENT_DIR -mindepth 1 -maxdepth 1 -type d | sort))
+NUM_DIRS=${#DIRS[@]}
+echo "Found $NUM_DIRS total directories to process."
+
+# 2. Define how many runs this manager task will process
+TASKS_PER_NODE=128 # This MUST match --tasks-per-node above
 
-# 2. Check if the array is empty
-if [ ${#DIRS[@]} -eq 0 ]; then
-    echo "Error: No run directories found in $RUN_PARENT_DIR"
-    exit 1
-fi
-
-# 3. Get the specific directory for THIS Slurm array task
-MY_RUN_DIR=${DIRS[$SLURM_ARRAY_TASK_ID]}
-RUN_NAME=$(basename $MY_RUN_DIR)
-
-# 4. Check if the directory was found
-if [ -z "$MY_RUN_DIR" ]; then
-    echo "Error: Could not find directory for task ID $SLURM_ARRAY_TASK_ID."
-    echo "Total directories found: ${#DIRS[@]}"
-    exit 1
-fi
-
-# 5. Define the full path for the output file
-MY_SAVE_PATH="$SAVE_PARENT_DIR/${RUN_NAME}.nc"
-
-# --- Run the Job ---
-echo "Slurm Job Array Task: $SLURM_ARRAY_TASK_ID"
-echo "Processing Run Dir: $MY_RUN_DIR"
-echo "Saving Output To: $MY_SAVE_PATH"
+# 3. Calculate the *chunk* of directories this manager task is responsible for
+#    Manager 0 (ID=0) processes dirs 0-127
+#    Manager 1 (ID=1) processes dirs 128-255
+#    ...and so on.
+START_INDEX=$(( $SLURM_ARRAY_TASK_ID * $TASKS_PER_NODE ))
+
+echo "This manager (Task ID $SLURM_ARRAY_TASK_ID) will process $TASKS_PER_NODE runs,"
+echo "starting from directory index $START_INDEX."
 echo "---"
 
-# --- CHOOSE YOUR MODE ---
-
-# OPTION 1: Run with use_dask=False (My recommendation for Lustre)
-# The --use-dask flag is absent, so the script defaults to False.
-echo "Running with use_dask=False"
-python -m adforce.process_single \
-    --run-path $MY_RUN_DIR \
-    --save-path $MY_SAVE_PATH
-
-# OPTION 2: Run with use_dask=True (To test your hypothesis)
-# Comment out the block above and uncomment this block.
-# The --use-dask flag is present, so the script sets it to True.
-# echo "Running with use_dask=True"
-# python -m adforce.process_single_run \
-#     --run-path $MY_RUN_DIR \
-#     --save-path $MY_SAVE_PATH \
-#     --use-dask
-
-echo "Task $SLURM_ARRAY_TASK_ID complete."
+# 4. Loop from 0 to 127
+for i in $(seq 0 $(( $TASKS_PER_NODE - 1 ))); do
+
+    # Get the *actual* directory index
+    DIR_INDEX=$(( $START_INDEX + $i ))
+
+    # Check if this directory exists (handles the last partial chunk)
+    if [ $DIR_INDEX -ge $NUM_DIRS ]; then
+        echo "Index $DIR_INDEX is out of bounds (>= $NUM_DIRS). No more dirs."
+        continue
+    fi
+
+    # Get the directory name and output file name
+    MY_RUN_DIR=${DIRS[$DIR_INDEX]}
+    RUN_NAME=$(basename $MY_RUN_DIR)
+    MY_SAVE_PATH="$SAVE_PARENT_DIR/${RUN_NAME}.nc"
+
+    echo "Launching task for: $RUN_NAME"
+
+    # --- STEP 4: LAUNCH THE PROCESS ---
+    # This is the key:
+    # 'srun'               : The Slurm command to run a task.
+    # '--nodes=1'          : Use 1 node (from our allocation).
+    # '--ntasks=1'         : This process is 1 task.
+    # '--cpus-per-task=1'  : This process gets 1 CPU.
+    # '--exclusive'        : Give this step its own dedicated CPU (don't share with other steps).
+    # '--output=/dev/null' : Discard this step's stdout (keeps the main log readable).
+    # '&'                  : Run this in the BACKGROUND.
+
+    srun --nodes=1 --ntasks=1 --cpus-per-task=1 --exclusive --output=/dev/null \
+        python -m adforce.process_single \
+        --run-path $MY_RUN_DIR \
+        --save-path $MY_SAVE_PATH &
+
+    # Add --use-dask here if you want to test it
+
+done
+
+# --- STEP 5: WAIT ---
+# The script will get here in ~1 second after launching 128 jobs.
+# 'wait' tells the script to pause here until all background (&)
+# processes have finished.
echo "---"
+echo "All $TASKS_PER_NODE tasks launched. Waiting for them to complete..."
+wait
+echo "All tasks complete. Manager job finished."

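The bare wait at STEP 5 blocks until every backgrounded srun step has returned, but it does not report which, if any, failed. If per-run failures matter, one option is to record each background PID and wait on them individually. A minimal sketch of that pattern in plain bash (the subshell with sleep is a stand-in for the real srun ... python -m adforce.process_single call, and task 1 is made to fail on purpose; pids and failed are illustrative names):

#!/bin/bash
# Sketch: wait on each background task individually and count failures.
pids=()
for i in 0 1 2; do
    ( sleep 1; [ "$i" -ne 1 ] ) &   # stand-in for the srun launch; exits non-zero for i=1
    pids+=($!)
done

failed=0
for pid in "${pids[@]}"; do
    wait "$pid" || failed=$(( failed + 1 ))
done
echo "All tasks finished; $failed task(s) exited non-zero."

The same bookkeeping drops into the script's own loop by appending pids+=($!) after the srun ... & line and replacing the single wait with the second loop.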
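For day-to-day use, the directives above assume the logs/ directory already exists (Slurm does not create directories named in --output). A typical submit-and-monitor sequence using only standard Slurm commands (the <jobid> placeholder is whatever sbatch prints):

mkdir -p logs
sbatch slurm/process_array.sbatch
squeue -u "$USER"                               # one row per manager array task
sacct -j <jobid> --format=JobID,State,Elapsed
tail -f logs/pack_swegnn-<jobid>_0.out          # follow the first manager's log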