Skip to content

Commit 17f69ea

Browse files
authored
Merge pull request #411 from bsaakash/master
abs - fixes to plom for remote run
2 parents d7993ba + b3f8933 commit 17f69ea

File tree

2 files changed

+133
-33
lines changed

2 files changed

+133
-33
lines changed

modules/performUQ/SimCenterUQ/runPLoM.py

Lines changed: 130 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@
4646
import shutil
4747
import subprocess
4848
import sys
49+
from datetime import datetime, timezone
50+
from pathlib import Path
4951

5052
import numpy as np
5153
import pandas as pd
@@ -173,7 +175,8 @@ def _run_simulation(self): # noqa: C901
173175
pass
174176
else:
175177
# pythonEXE = os.path.join(localAppDir,'applications','python','python.exe')
176-
pythonEXE = '"' + sys.executable + '"' # noqa: N806
178+
# pythonEXE = '"' + sys.executable + '"'
179+
pythonEXE = sys.executable # noqa: N806
177180
else:
178181
# for remote run and macOS, let's use system python
179182
pass
@@ -187,6 +190,7 @@ def _run_simulation(self): # noqa: C901
187190
if bldg_id is not None:
188191
os.chdir(bldg_id)
189192
os.chdir('templatedir')
193+
print(f'Current working directory: {os.getcwd()}') # noqa: PTH109, T201
190194

191195
# dakota script path
192196
dakotaScript = os.path.join( # noqa: PTH118, N806
@@ -216,41 +220,131 @@ def _run_simulation(self): # noqa: C901
216220
# command line
217221
# KZ modified 0331
218222
# command_line = f'{pythonEXE} {dakotaScript} --workflowInput sc_dakota_plom.json --driverFile {os.path.splitext(self.workflow_driver)[0]} --workflowOutput EDP.json --runType {runType}'
219-
command_line = f'{pythonEXE} {dakotaScript} --workflowInput sc_dakota_plom.json --driverFile {os.path.splitext(self.workflow_driver)[0]} --workflowOutput EDP.json --runType runningLocal' # noqa: PTH122
220-
print(command_line) # noqa: T201
221-
# run command
223+
# command_line = f'{pythonEXE} {dakotaScript} --workflowInput sc_dakota_plom.json --driverFile {os.path.splitext(self.workflow_driver)[0]} --workflowOutput EDP.json --runType runningLocal'
224+
# print(command_line)
225+
226+
# Build the command as a list (no shell interpretation)
227+
command_line = [
228+
pythonEXE,
229+
dakotaScript,
230+
'--workflowInput',
231+
'sc_dakota_plom.json',
232+
'--driverFile',
233+
os.path.splitext(self.workflow_driver)[0], # noqa: PTH122
234+
'--workflowOutput',
235+
'EDP.json',
236+
'--runType',
237+
'runningLocal',
238+
]
239+
240+
print( # noqa: T201
241+
'Command to run:', ' '.join(command_line)
242+
) # for debugging
243+
222244
dakotaTabPath = os.path.join(self.work_dir, 'dakotaTab.out') # noqa: PTH118, N806
223245
print(dakotaTabPath) # noqa: T201
224246

247+
# try:
248+
# os.system(command_line)
249+
# except:
250+
# print(
251+
# 'runPLoM._run_simulation: error in running dakota to generate the initial sample.'
252+
# )
253+
# print(
254+
# 'runPLoM._run_simulation: please check if the dakota is installed correctly on the system.'
255+
# )
256+
257+
# if not os.path.exists(dakotaTabPath):
258+
# try:
259+
# subprocess.call(command_line)
260+
# except:
261+
# print(
262+
# 'runPLoM._run_simulation: error in running dakota to generate the initial sample.'
263+
# )
264+
# print(
265+
# 'runPLoM._run_simulation: please check if the dakota is installed correctly on the system.'
266+
# )
267+
268+
# if not os.path.exists(dakotaTabPath):
269+
# msg = 'Dakota preprocessor did not run successfully'
270+
# self.errlog.exit(msg)
271+
272+
# decide if we're on HPC/remote
273+
is_remote = job_config.get('runType', 'runningLocal') == 'runningRemote'
274+
275+
# build a sanitized env ONLY for the child process running Dakota on remote
276+
env = os.environ.copy()
277+
removed = []
278+
if is_remote:
279+
# Strip common MPI/PMI launch-context vars so Dakota won't try MPI_Init via PMI
280+
prefixes = (
281+
'PMI_',
282+
'OMPI_',
283+
'MPI_',
284+
'I_MPI_',
285+
'HYDRA_',
286+
'SLURM_',
287+
'PMIX_',
288+
'MPICH_',
289+
'UCX_',
290+
'FI_',
291+
'PALS_',
292+
)
293+
for k in list(env):
294+
if k.startswith(prefixes):
295+
removed.append(k)
296+
env.pop(k, None)
297+
if removed:
298+
print( # noqa: T201
299+
f'sanitized env for dakota run (removed {len(removed)} keys): {removed[:6]} ...'
300+
)
301+
302+
completed = None
225303
try:
226-
os.system(command_line) # noqa: S605
227-
except: # noqa: E722
304+
# run DakotaUQ with the chosen env
305+
completed = subprocess.run( # noqa: S603
306+
command_line,
307+
check=True, # raise if exit code != 0
308+
capture_output=True, # capture BOTH stdout and stderr
309+
text=True, # return str, not bytes
310+
env=env if is_remote else None, # use sanitized env on remote
311+
)
312+
except subprocess.CalledProcessError as e:
228313
print( # noqa: T201
229314
'runPLoM._run_simulation: error in running dakota to generate the initial sample.'
230315
)
231316
print( # noqa: T201
232317
'runPLoM._run_simulation: please check if the dakota is installed correctly on the system.'
233318
)
234-
235-
if not os.path.exists(dakotaTabPath): # noqa: PTH110
236-
try:
237-
subprocess.call(command_line) # noqa: S603
238-
except: # noqa: E722
239-
print( # noqa: T201
240-
'runPLoM._run_simulation: error in running dakota to generate the initial sample.'
241-
)
242-
print( # noqa: T201
243-
'runPLoM._run_simulation: please check if the dakota is installed correctly on the system.'
244-
)
245-
319+
print(f'Command: {" ".join(e.cmd)}') # noqa: T201
320+
print(f'Return code: {e.returncode}') # noqa: T201
321+
if e.stdout:
322+
print('---- STDOUT ----') # noqa: T201
323+
print(e.stdout) # noqa: T201
324+
if e.stderr:
325+
print('---- STDERR ----') # noqa: T201
326+
print(e.stderr) # noqa: T201
327+
self.errlog.exit('Dakota did not run successfully')
328+
329+
# Verify expected output exists
246330
if not os.path.exists(dakotaTabPath): # noqa: PTH110
247-
msg = 'Dakota preprocessor did not run successfully'
248-
self.errlog.exit(msg)
331+
print( # noqa: T201
332+
'runPLoM._run_simulation: Dakota finished without creating the expected output file.'
333+
)
334+
print(f'Expected file missing: {dakotaTabPath}') # noqa: T201
335+
if completed is not None:
336+
if completed.stdout:
337+
print('---- STDOUT ----') # noqa: T201
338+
print(completed.stdout) # noqa: T201
339+
if completed.stderr:
340+
print('---- STDERR ----') # noqa: T201
341+
print(completed.stderr) # noqa: T201
342+
self.errlog.exit('Dakota did not run successfully')
249343

250344
# remove the new dakota.json
251345
# os.remove('sc_dakota_plom.json')
252346

253-
if runType in ['run', 'runningLocal']:
347+
if runType in ['run', 'runningLocal', 'runningRemote']:
254348
# create the response.csv file from the dakotaTab.out file
255349
os.chdir(run_dir)
256350
if bldg_id is not None:
@@ -273,7 +367,7 @@ def _run_simulation(self): # noqa: C901
273367
)
274368
self.job_config = job_config
275369

276-
elif self.run_type in ['set_up', 'runningRemote']:
370+
elif self.run_type in ['set_up']:
277371
pass
278372

279373
def _prepare_training_data(self, run_dir): # noqa: C901
@@ -681,7 +775,7 @@ def _load_hyperparameter(self):
681775
if (
682776
self.constraintsFlag
683777
): # sy - added because quoFEM/EE-UQ example failed 09/10/2024
684-
constr_file = Path(self.constraintsFile).resolve() # noqa: F405
778+
constr_file = Path(self.constraintsFile).resolve()
685779
sys.path.insert(0, str(constr_file.parent) + '/')
686780
constr_script = importlib.__import__( # noqa: F405
687781
constr_file.name[:-3], globals(), locals(), [], 0
@@ -690,7 +784,7 @@ def _load_hyperparameter(self):
690784
print('beta_c = ', self.beta_c) # noqa: T201
691785
# if smootherKDE
692786
if self.smootherKDE_Customize:
693-
kde_file = Path(self.smootherKDE_file).resolve() # noqa: F405
787+
kde_file = Path(self.smootherKDE_file).resolve()
694788
sys.path.insert(0, str(kde_file.parent) + '/')
695789
kde_script = importlib.__import__( # noqa: F405
696790
kde_file.name[:-3], globals(), locals(), [], 0
@@ -701,7 +795,7 @@ def _load_hyperparameter(self):
701795
print('epsilon_k = ', self.smootherKDE) # noqa: T201
702796
# if tolKDE
703797
if self.kdeTolerance_Customize:
704-
beta_file = Path(self.kdeTolerance_file).resolve() # noqa: F405
798+
beta_file = Path(self.kdeTolerance_file).resolve()
705799
sys.path.insert(0, str(beta_file.parent) + '/')
706800
beta_script = importlib.__import__( # noqa: F405
707801
beta_file.name[:-3], globals(), locals(), [], 0
@@ -966,13 +1060,19 @@ def read_txt(text_dir, errlog): # noqa: D103
9661060

9671061
class errorLog:
    """Record a fatal PLoM error in ``<work_dir>/dakota.err`` and exit.

    The log file is opened only when an error is reported, so no file handle
    is held for the lifetime of the object.
    """

    def __init__(self, work_dir):
        """Remember the log location and make sure its directory exists.

        Parameters
        ----------
        work_dir : str or os.PathLike
            Directory in which ``dakota.err`` is written.

        """
        self.path = Path(work_dir) / 'dakota.err'
        self.path.parent.mkdir(parents=True, exist_ok=True)  # ensure dir exists

    def exit(self, msg):
        """Report ``msg`` on stderr, append it to the log file, and terminate.

        Parameters
        ----------
        msg : object
            The error description. Anything that is not already a ``str``
            (for example an exception instance) is converted with ``str()``
            so that it is still recorded instead of failing on ``.rstrip()``.

        Raises
        ------
        SystemExit
            Always, with exit code ``-1``.

        """
        # Coerce once so printing and file logging see the same text.
        text = msg if isinstance(msg, str) else str(msg)

        print(text, file=sys.stderr)  # also send to stderr  # noqa: T201
        try:
            # Append so that earlier entries in the same file are preserved.
            with self.path.open('a', encoding='utf-8', errors='replace') as f:
                ts = datetime.now(tz=timezone.utc).isoformat(timespec='seconds')
                f.write(f'\n\n---- PLoM error @ {ts} ----\n')
                f.write(text.rstrip() + '\n')
        except Exception as e:  # noqa: BLE001
            # Best effort: a failure to log must not prevent the exit below.
            print(f'[WARN] Could not write to {self.path}: {e}', file=sys.stderr)  # noqa: T201
        sys.exit(-1)  # nonzero exit for failure
9761076

9771077

9781078
def build_surrogate(work_dir, os_type, run_type, input_file, workflow_driver):

modules/performUQ/dakota/DakotaUQ.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -142,9 +142,9 @@ def main(args): # noqa: C901, D103
142142
returncode = e.returncode # noqa: F841
143143
run_success = False
144144

145-
dakotaErrFile = os.path.join(cwd, 'dakota.err') # noqa: N806, PTH109, PTH118, RUF100, W291
146-
dakotaOutFile = os.path.join(cwd, 'dakota.out') # noqa: N806, PTH109, PTH118, RUF100
147-
dakotaTabFile = os.path.join(cwd, 'dakotaTab.out') # noqa: N806, PTH109, PTH118, RUF100
145+
dakotaErrFile = os.path.join(cwd, 'dakota.err') # noqa: N806, PTH109, PTH118
146+
dakotaOutFile = os.path.join(cwd, 'dakota.out') # noqa: N806, PTH109, PTH118
147+
dakotaTabFile = os.path.join(cwd, 'dakotaTab.out') # noqa: N806, PTH109, PTH118
148148
checkErrFile = os.path.exists(dakotaErrFile) # noqa: PTH110, N806
149149
checkOutFile = os.path.exists(dakotaOutFile) # noqa: PTH110, N806
150150
checkTabFile = os.path.exists(dakotaTabFile) # noqa: F841, N806, PTH110

0 commit comments

Comments
 (0)