From eec7c86ddd3d58ef50a07e64fd1e83e799dba9a0 Mon Sep 17 00:00:00 2001 From: bsaakash <11618528+bsaakash@users.noreply.github.com> Date: Mon, 11 Aug 2025 22:43:37 -0700 Subject: [PATCH 1/8] abs - improving the dakota call in plom and showing more descriptive error messages --- modules/performUQ/SimCenterUQ/runPLoM.py | 99 +++++++++++++++++++----- 1 file changed, 80 insertions(+), 19 deletions(-) diff --git a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py index c9c94a52a..382cba000 100644 --- a/modules/performUQ/SimCenterUQ/runPLoM.py +++ b/modules/performUQ/SimCenterUQ/runPLoM.py @@ -187,6 +187,7 @@ def _run_simulation(self): # noqa: C901 if bldg_id is not None: os.chdir(bldg_id) os.chdir('templatedir') + print(f'Current working directory: {os.getcwd()}') # noqa: PTH109, T201 # dakota script path dakotaScript = os.path.join( # noqa: PTH118, N806 @@ -216,36 +217,96 @@ def _run_simulation(self): # noqa: C901 # command line # KZ modified 0331 # command_line = f'{pythonEXE} {dakotaScript} --workflowInput sc_dakota_plom.json --driverFile {os.path.splitext(self.workflow_driver)[0]} --workflowOutput EDP.json --runType {runType}' - command_line = f'{pythonEXE} {dakotaScript} --workflowInput sc_dakota_plom.json --driverFile {os.path.splitext(self.workflow_driver)[0]} --workflowOutput EDP.json --runType runningLocal' # noqa: PTH122 - print(command_line) # noqa: T201 - # run command + # command_line = f'{pythonEXE} {dakotaScript} --workflowInput sc_dakota_plom.json --driverFile {os.path.splitext(self.workflow_driver)[0]} --workflowOutput EDP.json --runType runningLocal' + # print(command_line) + + # Build the command as a list (no shell interpretation) + command_line = [ + pythonEXE, + dakotaScript, + '--workflowInput', + 'sc_dakota_plom.json', + '--driverFile', + os.path.splitext(self.workflow_driver)[0], # noqa: PTH122 + '--workflowOutput', + 'EDP.json', + '--runType', + 'runningLocal', + ] + + print( # noqa: T201 + 'Command to run 
dakota:', ' '.join(command_line) + ) # for debugging + dakotaTabPath = os.path.join(self.work_dir, 'dakotaTab.out') # noqa: PTH118, N806 print(dakotaTabPath) # noqa: T201 + # try: + # os.system(command_line) + # except: + # print( + # 'runPLoM._run_simulation: error in running dakota to generate the initial sample.' + # ) + # print( + # 'runPLoM._run_simulation: please check if the dakota is installed correctly on the system.' + # ) + + # if not os.path.exists(dakotaTabPath): + # try: + # subprocess.call(command_line) + # except: + # print( + # 'runPLoM._run_simulation: error in running dakota to generate the initial sample.' + # ) + # print( + # 'runPLoM._run_simulation: please check if the dakota is installed correctly on the system.' + # ) + + # if not os.path.exists(dakotaTabPath): + # msg = 'Dakota preprocessor did not run successfully' + # self.errlog.exit(msg) + + print(f"Dakota path: {shutil.which('dakota')}") # noqa: T201 + + completed = None try: - os.system(command_line) # noqa: S605 - except: # noqa: E722 + completed = subprocess.run( # noqa: S603 + command_line, + check=True, # raise if exit code != 0 + capture_output=True, # capture BOTH stdout and stderr + text=True, # return str, not bytes + ) + except subprocess.CalledProcessError as e: print( # noqa: T201 'runPLoM._run_simulation: error in running dakota to generate the initial sample.' ) print( # noqa: T201 'runPLoM._run_simulation: please check if the dakota is installed correctly on the system.' 
) - + print(f'Command: {" ".join(e.cmd)}') # noqa: T201 + print(f'Return code: {e.returncode}') # noqa: T201 + if e.stdout: + print('---- STDOUT ----') # noqa: T201 + print(e.stdout) # noqa: T201 + if e.stderr: + print('---- STDERR ----') # noqa: T201 + print(e.stderr) # noqa: T201 + self.errlog.exit('Dakota preprocessor did not run successfully') + + # Verify expected output exists if not os.path.exists(dakotaTabPath): # noqa: PTH110 - try: - subprocess.call(command_line) # noqa: S603 - except: # noqa: E722 - print( # noqa: T201 - 'runPLoM._run_simulation: error in running dakota to generate the initial sample.' - ) - print( # noqa: T201 - 'runPLoM._run_simulation: please check if the dakota is installed correctly on the system.' - ) - - if not os.path.exists(dakotaTabPath): # noqa: PTH110 - msg = 'Dakota preprocessor did not run successfully' - self.errlog.exit(msg) + print( # noqa: T201 + 'runPLoM._run_simulation: Dakota finished without creating the expected output file.' + ) + print(f'Expected file missing: {dakotaTabPath}') # noqa: T201 + if completed is not None: + if completed.stdout: + print('---- STDOUT ----') # noqa: T201 + print(completed.stdout) # noqa: T201 + if completed.stderr: + print('---- STDERR ----') # noqa: T201 + print(completed.stderr) # noqa: T201 + self.errlog.exit('Dakota preprocessor did not run successfully') # remove the new dakota.json # os.remove('sc_dakota_plom.json') From 774d40c41c983ab8ea14c40ccf11fe0bfc7ec18c Mon Sep 17 00:00:00 2001 From: bsaakash <11618528+bsaakash@users.noreply.github.com> Date: Mon, 11 Aug 2025 23:12:20 -0700 Subject: [PATCH 2/8] abs - removing quotes in path to python executable on windows since using subprocess and a list of arguments with shell=False --- modules/performUQ/SimCenterUQ/runPLoM.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py index 382cba000..0c66b538c 100644 --- 
a/modules/performUQ/SimCenterUQ/runPLoM.py +++ b/modules/performUQ/SimCenterUQ/runPLoM.py @@ -173,7 +173,8 @@ def _run_simulation(self): # noqa: C901 pass else: # pythonEXE = os.path.join(localAppDir,'applications','python','python.exe') - pythonEXE = '"' + sys.executable + '"' # noqa: N806 + # pythonEXE = '"' + sys.executable + '"' + pythonEXE = sys.executable # noqa: N806 else: # for remote run and macOS, let's use system python pass @@ -235,7 +236,7 @@ def _run_simulation(self): # noqa: C901 ] print( # noqa: T201 - 'Command to run dakota:', ' '.join(command_line) + 'Command to run:', ' '.join(command_line) ) # for debugging dakotaTabPath = os.path.join(self.work_dir, 'dakotaTab.out') # noqa: PTH118, N806 From b44e4d2d0c826008eaab11f1ecb6de92893d54c3 Mon Sep 17 00:00:00 2001 From: bsaakash <11618528+bsaakash@users.noreply.github.com> Date: Mon, 11 Aug 2025 23:17:41 -0700 Subject: [PATCH 3/8] abs - removing unneeded print --- modules/performUQ/SimCenterUQ/runPLoM.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py index 0c66b538c..b1276c207 100644 --- a/modules/performUQ/SimCenterUQ/runPLoM.py +++ b/modules/performUQ/SimCenterUQ/runPLoM.py @@ -267,8 +267,6 @@ def _run_simulation(self): # noqa: C901 # msg = 'Dakota preprocessor did not run successfully' # self.errlog.exit(msg) - print(f"Dakota path: {shutil.which('dakota')}") # noqa: T201 - completed = None try: completed = subprocess.run( # noqa: S603 From f5899e93faa54ab339da3ee794d54371838b9b09 Mon Sep 17 00:00:00 2001 From: bsaakash <11618528+bsaakash@users.noreply.github.com> Date: Wed, 20 Aug 2025 15:05:31 -0700 Subject: [PATCH 4/8] abs - improving plom error log - fixes overwriting dakota error messages --- modules/performUQ/SimCenterUQ/runPLoM.py | 25 ++++++++++++++++-------- modules/performUQ/dakota/DakotaUQ.py | 6 +++--- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git 
a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py index b1276c207..693578dbe 100644 --- a/modules/performUQ/SimCenterUQ/runPLoM.py +++ b/modules/performUQ/SimCenterUQ/runPLoM.py @@ -46,6 +46,8 @@ import shutil import subprocess import sys +from datetime import datetime, timezone +from pathlib import Path import numpy as np import pandas as pd @@ -741,7 +743,7 @@ def _load_hyperparameter(self): if ( self.constraintsFlag ): # sy - added because quoFEM/EE-UQ example failed 09/10/2024 - constr_file = Path(self.constraintsFile).resolve() # noqa: F405 + constr_file = Path(self.constraintsFile).resolve() sys.path.insert(0, str(constr_file.parent) + '/') constr_script = importlib.__import__( # noqa: F405 constr_file.name[:-3], globals(), locals(), [], 0 @@ -750,7 +752,7 @@ def _load_hyperparameter(self): print('beta_c = ', self.beta_c) # noqa: T201 # if smootherKDE if self.smootherKDE_Customize: - kde_file = Path(self.smootherKDE_file).resolve() # noqa: F405 + kde_file = Path(self.smootherKDE_file).resolve() sys.path.insert(0, str(kde_file.parent) + '/') kde_script = importlib.__import__( # noqa: F405 kde_file.name[:-3], globals(), locals(), [], 0 @@ -761,7 +763,7 @@ def _load_hyperparameter(self): print('epsilon_k = ', self.smootherKDE) # noqa: T201 # if tolKDE if self.kdeTolerance_Customize: - beta_file = Path(self.kdeTolerance_file).resolve() # noqa: F405 + beta_file = Path(self.kdeTolerance_file).resolve() sys.path.insert(0, str(beta_file.parent) + '/') beta_script = importlib.__import__( # noqa: F405 beta_file.name[:-3], globals(), locals(), [], 0 @@ -1026,13 +1028,20 @@ def read_txt(text_dir, errlog): # noqa: D103 class errorLog: # noqa: D101 def __init__(self, work_dir): - self.file = open(f'{work_dir}/dakota.err', 'w') # noqa: SIM115, PTH123 + self.path = Path(work_dir) / 'dakota.err' + self.path.parent.mkdir(parents=True, exist_ok=True) # ensure dir exists + self.path.touch(exist_ok=True) # create if missing def exit(self, msg): 
# noqa: D102 - print(msg) # noqa: T201 - self.file.write(msg) - self.file.close() - exit(-1) # noqa: PLR1722 + print(msg, file=sys.stderr) # also send to stderr # noqa: T201 + try: + with self.path.open('a', encoding='utf-8', errors='replace') as f: + ts = datetime.now(tz=timezone.utc).isoformat(timespec='seconds') + f.write(f'\n\n---- PLoM error @ {ts} ----\n') + f.write(msg.rstrip() + '\n') + except Exception as e: # noqa: BLE001 + print(f'[WARN] Could not write to {self.path}: {e}', file=sys.stderr) # noqa: T201 + sys.exit(-1) # nonzero exit for failure def build_surrogate(work_dir, os_type, run_type, input_file, workflow_driver): diff --git a/modules/performUQ/dakota/DakotaUQ.py b/modules/performUQ/dakota/DakotaUQ.py index d73db4419..1753f0b76 100644 --- a/modules/performUQ/dakota/DakotaUQ.py +++ b/modules/performUQ/dakota/DakotaUQ.py @@ -142,9 +142,9 @@ def main(args): # noqa: C901, D103 returncode = e.returncode # noqa: F841 run_success = False - dakotaErrFile = os.path.join(cwd, 'dakota.err') # noqa: N806, PTH109, PTH118, RUF100, W291 - dakotaOutFile = os.path.join(cwd, 'dakota.out') # noqa: N806, PTH109, PTH118, RUF100 - dakotaTabFile = os.path.join(cwd, 'dakotaTab.out') # noqa: N806, PTH109, PTH118, RUF100 + dakotaErrFile = os.path.join(cwd, 'dakota.err') # noqa: N806, PTH109, PTH118 + dakotaOutFile = os.path.join(cwd, 'dakota.out') # noqa: N806, PTH109, PTH118 + dakotaTabFile = os.path.join(cwd, 'dakotaTab.out') # noqa: N806, PTH109, PTH118 checkErrFile = os.path.exists(dakotaErrFile) # noqa: PTH110, N806 checkOutFile = os.path.exists(dakotaOutFile) # noqa: PTH110, N806 checkTabFile = os.path.exists(dakotaTabFile) # noqa: F841, N806, PTH110 From 6675b3db1639b7b428d059e90b3022be121f2941 Mon Sep 17 00:00:00 2001 From: bsaakash <11618528+bsaakash@users.noreply.github.com> Date: Wed, 20 Aug 2025 16:24:29 -0700 Subject: [PATCH 5/8] abs - fixing slightly misleading error message in plom call to dakota --- modules/performUQ/SimCenterUQ/runPLoM.py | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py index 693578dbe..c5815ac39 100644 --- a/modules/performUQ/SimCenterUQ/runPLoM.py +++ b/modules/performUQ/SimCenterUQ/runPLoM.py @@ -292,7 +292,7 @@ def _run_simulation(self): # noqa: C901 if e.stderr: print('---- STDERR ----') # noqa: T201 print(e.stderr) # noqa: T201 - self.errlog.exit('Dakota preprocessor did not run successfully') + self.errlog.exit('Dakota did not run successfully') # Verify expected output exists if not os.path.exists(dakotaTabPath): # noqa: PTH110 @@ -307,7 +307,7 @@ def _run_simulation(self): # noqa: C901 if completed.stderr: print('---- STDERR ----') # noqa: T201 print(completed.stderr) # noqa: T201 - self.errlog.exit('Dakota preprocessor did not run successfully') + self.errlog.exit('Dakota did not run successfully') # remove the new dakota.json # os.remove('sc_dakota_plom.json') From ee95113015de8705c9f0f7da78173be7f6e6d37d Mon Sep 17 00:00:00 2001 From: bsaakash <11618528+bsaakash@users.noreply.github.com> Date: Wed, 20 Aug 2025 16:33:19 -0700 Subject: [PATCH 6/8] abs - removing mpi-related variables in env for plom process which runs dakota when remote --- modules/performUQ/SimCenterUQ/runPLoM.py | 32 ++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py index c5815ac39..74123f857 100644 --- a/modules/performUQ/SimCenterUQ/runPLoM.py +++ b/modules/performUQ/SimCenterUQ/runPLoM.py @@ -269,13 +269,45 @@ def _run_simulation(self): # noqa: C901 # msg = 'Dakota preprocessor did not run successfully' # self.errlog.exit(msg) + # decide if we're on HPC/remote + is_remote = job_config.get('runType', 'runningLocal') == 'runningRemote' + + # build a sanitized env ONLY for the child process running Dakota on remote + env = os.environ.copy() + removed = [] + if is_remote: + # Strip common MPI/PMI launch-context 
vars so Dakota won't try MPI_Init via PMI + prefixes = ( + 'PMI_', + 'OMPI_', + 'MPI_', + 'I_MPI_', + 'HYDRA_', + 'SLURM_', + 'PMIX_', + 'MPICH_', + 'UCX_', + 'FI_', + 'PALS_', + ) + for k in list(env): + if k.startswith(prefixes): + removed.append(k) + env.pop(k, None) + if removed: + print( # noqa: T201 + f'sanitized env for dakota run (removed {len(removed)} keys): {removed[:6]} ...' + ) + completed = None try: + # run DakotaUQ with the chosen env completed = subprocess.run( # noqa: S603 command_line, check=True, # raise if exit code != 0 capture_output=True, # capture BOTH stdout and stderr text=True, # return str, not bytes + env=env if is_remote else None, # use sanitized env on remote ) except subprocess.CalledProcessError as e: print( # noqa: T201 From 6ea15c490d9679481fe927233546ace31ef0a8cd Mon Sep 17 00:00:00 2001 From: bsaakash <11618528+bsaakash@users.noreply.github.com> Date: Wed, 20 Aug 2025 17:15:59 -0700 Subject: [PATCH 7/8] abs - post-processing after the dakota simulation when running remotely as well --- modules/performUQ/SimCenterUQ/runPLoM.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py index 74123f857..3ff0423df 100644 --- a/modules/performUQ/SimCenterUQ/runPLoM.py +++ b/modules/performUQ/SimCenterUQ/runPLoM.py @@ -344,7 +344,7 @@ def _run_simulation(self): # noqa: C901 # remove the new dakota.json # os.remove('sc_dakota_plom.json') - if runType in ['run', 'runningLocal']: + if runType in ['run', 'runningLocal', 'runningRemote']: # create the response.csv file from the dakotaTab.out file os.chdir(run_dir) if bldg_id is not None: @@ -367,7 +367,7 @@ def _run_simulation(self): # noqa: C901 ) self.job_config = job_config - elif self.run_type in ['set_up', 'runningRemote']: + elif self.run_type in ['set_up']: pass def _prepare_training_data(self, run_dir): # noqa: C901 From 4a5d02288d96ae02db1f72eedcc687d93ba49c0c Mon Sep 17 00:00:00 2001 
From: bsaakash <11618528+bsaakash@users.noreply.github.com> Date: Wed, 20 Aug 2025 17:17:15 -0700 Subject: [PATCH 8/8] abs - not creating err file in init, otherwise dakota will not write to it --- modules/performUQ/SimCenterUQ/runPLoM.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py index 3ff0423df..c50a92419 100644 --- a/modules/performUQ/SimCenterUQ/runPLoM.py +++ b/modules/performUQ/SimCenterUQ/runPLoM.py @@ -1062,7 +1062,6 @@ class errorLog: # noqa: D101 def __init__(self, work_dir): self.path = Path(work_dir) / 'dakota.err' self.path.parent.mkdir(parents=True, exist_ok=True) # ensure dir exists - self.path.touch(exist_ok=True) # create if missing def exit(self, msg): # noqa: D102 print(msg, file=sys.stderr) # also send to stderr # noqa: T201