#!/bin/bash --login

- #SBATCH --job-name=process_swegnn
- #SBATCH --nodes=1
- #SBATCH --tasks-per-node=128
- #SBATCH --cpus-per-task=1
- #SBATCH --time=04:00:00
+ #SBATCH --job-name=pack_swegnn
#SBATCH --account=n02-bas
#SBATCH --partition=standard
#SBATCH --qos=standard

- # --- JOB ARRAY ---
- # Replace '499' with your (total_runs - 1)
- # %128 means "run 128 tasks at a time"
- #SBATCH --array=0-499%128
+ # --- STEP 1: THE "MANAGER" ARRAY ---
+ # We need to process ~500 runs.
+ # Each manager job processes 128 runs at a time,
+ # so we need ceil(500 / 128) = 4 array tasks.
+ # That is the range 0-3 (4 tasks), set below
+ # (see the arithmetic sketch after the directive).
+ #SBATCH --array=0-3
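+ # A sketch of the ceiling arithmetic above (TOTAL_RUNS is a hypothetical name;
+ # adjust it to the real number of run directories). #SBATCH lines are parsed
+ # before the script runs, so this cannot set --array automatically, but the
+ # range can be overridden at submission time, e.g. 'sbatch --array=0-3 <script>'.
+ #   TOTAL_RUNS=500
+ #   TASKS_PER_NODE=128
+ #   NUM_MANAGERS=$(( (TOTAL_RUNS + TASKS_PER_NODE - 1) / TASKS_PER_NODE ))  # ceil -> 4
+ #   echo "Use: #SBATCH --array=0-$(( NUM_MANAGERS - 1 ))"                   # 0-3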
+
+ # --- STEP 2: RESOURCE REQUEST (PER MANAGER) ---
+ # Each *manager task* (0, 1, 2, or 3) gets ONE full node,
+ # and then uses all 128 cores on that node.
+ #SBATCH --nodes=1
+ #SBATCH --tasks-per-node=128
+ # 1 hour to process the 128 runs in parallel (adjust if needed):
+ #SBATCH --time=01:00:00
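+ # Sizing note: 4 manager tasks x 128 cores gives 512 slots for ~500 runs, so
+ # each core handles at most one run, and --time only has to cover the slowest
+ # single run plus environment setup, not 128 runs back to back.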

- # Output log file:
- # %x = job name, %A = job ID, %a = array task ID
+ # Log file for each manager task
#SBATCH --output=logs/%x-%A_%a.out

- # #SBATCH --mail-type=BEGIN,END,FAIL
-
-
# --- Setup ---
source /work/n02/n02/sdat2/.bashrc
- which micromamba
micromamba activate t1

# Matplotlib cache
- mkdir $(pwd)'/matplotlib'
+ mkdir -p $(pwd)'/matplotlib'
export MPLCONFIGDIR=$(pwd)'/matplotlib'

- echo "Which python:"
- which python
+ echo "---"
+ echo "Job Pack Manager Task: $SLURM_ARRAY_TASK_ID"
+ echo "Node: $SLURM_NODELIST"
+ echo "Cores: $SLURM_NTASKS"
echo "---"

# --- Define Paths ---
RUN_PARENT_DIR="/work/n02/n02/sdat2/adcirc-swan/worstsurge/run_5sec"
SAVE_PARENT_DIR="/work/n02/n02/sdat2/adcirc-swan/worstsurge/swegnn_5sec"
-
- # Create the main output directory
mkdir -p $SAVE_PARENT_DIR

- # --- Job Array Logic ---
- # 1. Create a bash array of all run directories
+ # --- STEP 3: JOB PACKING LOGIC ---
+
+ # 1. Get the *full* list of all directories
DIRS=($(find $RUN_PARENT_DIR -mindepth 1 -maxdepth 1 -type d | sort))
+ NUM_DIRS=${#DIRS[@]}
+ echo "Found $NUM_DIRS total directories to process."
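+ # Optional guard (a sketch, not enabled): stop early if the find above matched
+ # nothing, instead of looping over an empty array.
+ #   if [ "$NUM_DIRS" -eq 0 ]; then
+ #       echo "Error: No run directories found in $RUN_PARENT_DIR"
+ #       exit 1
+ #   fi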
+
+ # 2. Define how many runs this manager task will process
+ TASKS_PER_NODE=128  # This MUST match --tasks-per-node above

- # 2. Check if the array is empty
- if [ ${#DIRS[@]} -eq 0 ]; then
-     echo "Error: No run directories found in $RUN_PARENT_DIR"
-     exit 1
- fi
-
- # 3. Get the specific directory for THIS Slurm array task
- MY_RUN_DIR=${DIRS[$SLURM_ARRAY_TASK_ID]}
- RUN_NAME=$(basename $MY_RUN_DIR)
-
- # 4. Check if the directory was found
- if [ -z "$MY_RUN_DIR" ]; then
-     echo "Error: Could not find directory for task ID $SLURM_ARRAY_TASK_ID."
-     echo "Total directories found: ${#DIRS[@]}"
-     exit 1
- fi
-
- # 5. Define the full path for the output file
- MY_SAVE_PATH="$SAVE_PARENT_DIR/${RUN_NAME}.nc"
-
- # --- Run the Job ---
- echo "Slurm Job Array Task: $SLURM_ARRAY_TASK_ID"
- echo "Processing Run Dir: $MY_RUN_DIR"
- echo "Saving Output To: $MY_SAVE_PATH"
+ # 3. Calculate the *chunk* of directories this manager task is responsible for:
+ #    Manager 0 (ID=0) processes dirs 0-127
+ #    Manager 1 (ID=1) processes dirs 128-255
+ #    ...and so on.
+ START_INDEX=$(( $SLURM_ARRAY_TASK_ID * $TASKS_PER_NODE ))
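+ # Worked example: the last manager (ID=3) starts at 3 * 128 = 384; with ~500
+ # directories its chunk (indices 384-511) runs past the end of the list, and
+ # the bounds check inside the loop below simply skips the missing indices.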
+
+ echo "This manager (Task ID $SLURM_ARRAY_TASK_ID) will process $TASKS_PER_NODE runs,"
+ echo "starting from directory index $START_INDEX."
echo "---"

- # --- CHOOSE YOUR MODE ---
-
- # OPTION 1: Run with use_dask=False (My recommendation for Lustre)
- # The --use-dask flag is absent, so the script defaults to False.
- echo "Running with use_dask=False"
- python -m adforce.process_single \
-     --run-path $MY_RUN_DIR \
-     --save-path $MY_SAVE_PATH
-
- # OPTION 2: Run with use_dask=True (To test your hypothesis)
- # Comment out the block above and uncomment this block.
- # The --use-dask flag is present, so the script sets it to True.
- # echo "Running with use_dask=True"
- # python -m adforce.process_single_run \
- #     --run-path $MY_RUN_DIR \
- #     --save-path $MY_SAVE_PATH \
- #     --use-dask
-
- echo "Task $SLURM_ARRAY_TASK_ID complete."
+ # 4. Loop from 0 to 127
+ for i in $(seq 0 $(( $TASKS_PER_NODE - 1 ))); do
+
+     # Get the *actual* directory index
+     DIR_INDEX=$(( $START_INDEX + $i ))
+
+     # Check if this directory exists (handles the last partial chunk)
+     if [ $DIR_INDEX -ge $NUM_DIRS ]; then
+         echo "Index $DIR_INDEX is out of bounds (>= $NUM_DIRS). No more dirs."
+         continue
+     fi
+
+     # Get the directory name and output file name
+     MY_RUN_DIR=${DIRS[$DIR_INDEX]}
+     RUN_NAME=$(basename $MY_RUN_DIR)
+     MY_SAVE_PATH="$SAVE_PARENT_DIR/${RUN_NAME}.nc"
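+     # Optional (a sketch, not enabled): skip runs whose output file already
+     # exists, which makes it cheap to resubmit after a partial failure.
+     #   if [ -f "$MY_SAVE_PATH" ]; then
+     #       echo "Skipping $RUN_NAME: output already exists."
+     #       continue
+     #   fi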
+
+     echo "Launching task for: $RUN_NAME"
+
+     # --- STEP 4: LAUNCH THE PROCESS ---
+     # This is the key:
+     #   'srun'               : the Slurm command to launch a job step.
+     #   '--nodes=1'          : use 1 node (from our allocation).
+     #   '--ntasks=1'         : this step is 1 task.
+     #   '--cpus-per-task=1'  : the step gets 1 CPU.
+     #   '--exclusive'        : give the step its own core, not shared with other steps.
+     #   '--output=/dev/null' : discard the step's stdout (the manager's own echoes still go to the main log).
+     #   '&'                  : run the step in the BACKGROUND.
+
+     srun --nodes=1 --ntasks=1 --cpus-per-task=1 --exclusive --output=/dev/null \
+         python -m adforce.process_single \
+         --run-path $MY_RUN_DIR \
+         --save-path $MY_SAVE_PATH &
+
+     # Add --use-dask here if you want to test it
+
+ done
+
+ # --- STEP 5: WAIT ---
+ # The script will get here in ~1 second after launching the 128 jobs.
+ # 'wait' tells the script to pause here until all background (&)
+ # processes have finished.
+ echo "---"
+ echo "All $TASKS_PER_NODE tasks launched. Waiting for them to complete..."
+ wait
+ echo "All tasks complete. Manager job finished."
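+
+ # Optional sketch (not enabled): a bare 'wait' does not report which, if any,
+ # background steps failed. To flag failures, the loop above could collect each
+ # srun's PID in a (hypothetical) PIDS array and wait on them one by one:
+ #   PIDS=()                       # before the loop
+ #   PIDS+=($!)                    # in the loop, right after the 'srun ... &'
+ #   for pid in "${PIDS[@]}"; do   # instead of the bare 'wait'
+ #       wait "$pid" || echo "Task with PID $pid exited non-zero."
+ #   done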