From eec7c86ddd3d58ef50a07e64fd1e83e799dba9a0 Mon Sep 17 00:00:00 2001
From: bsaakash <11618528+bsaakash@users.noreply.github.com>
Date: Mon, 11 Aug 2025 22:43:37 -0700
Subject: [PATCH 1/8] abs - improving the dakota call in plom and showing more
 descriptive error messages

---
 modules/performUQ/SimCenterUQ/runPLoM.py | 99 +++++++++++++++++++-----
 1 file changed, 80 insertions(+), 19 deletions(-)

diff --git a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py
index c9c94a52a..382cba000 100644
--- a/modules/performUQ/SimCenterUQ/runPLoM.py
+++ b/modules/performUQ/SimCenterUQ/runPLoM.py
@@ -187,6 +187,7 @@ def _run_simulation(self):  # noqa: C901
         if bldg_id is not None:
             os.chdir(bldg_id)
         os.chdir('templatedir')
+        print(f'Current working directory: {os.getcwd()}')  # noqa: PTH109, T201
 
         # dakota script path
         dakotaScript = os.path.join(  # noqa: PTH118, N806
@@ -216,36 +217,96 @@ def _run_simulation(self):  # noqa: C901
         # command line
         # KZ modified 0331
         # command_line = f'{pythonEXE} {dakotaScript} --workflowInput sc_dakota_plom.json --driverFile {os.path.splitext(self.workflow_driver)[0]} --workflowOutput EDP.json --runType {runType}'
-        command_line = f'{pythonEXE} {dakotaScript} --workflowInput sc_dakota_plom.json --driverFile {os.path.splitext(self.workflow_driver)[0]} --workflowOutput EDP.json --runType runningLocal'  # noqa: PTH122
-        print(command_line)  # noqa: T201
-        # run command
+        # command_line = f'{pythonEXE} {dakotaScript} --workflowInput sc_dakota_plom.json --driverFile {os.path.splitext(self.workflow_driver)[0]} --workflowOutput EDP.json --runType runningLocal'
+        # print(command_line)
+
+        # Build the command as a list (no shell interpretation)
+        command_line = [
+            pythonEXE,
+            dakotaScript,
+            '--workflowInput',
+            'sc_dakota_plom.json',
+            '--driverFile',
+            os.path.splitext(self.workflow_driver)[0],  # noqa: PTH122
+            '--workflowOutput',
+            'EDP.json',
+            '--runType',
+            'runningLocal',
+        ]
+
+        print(  # noqa: T201
+            'Command to run dakota:', ' '.join(command_line)
+        )  # for debugging
+
         dakotaTabPath = os.path.join(self.work_dir, 'dakotaTab.out')  # noqa: PTH118, N806
         print(dakotaTabPath)  # noqa: T201
 
+        # try:
+        #     os.system(command_line)
+        # except:
+        #     print(
+        #         'runPLoM._run_simulation: error in running dakota to generate the initial sample.'
+        #     )
+        #     print(
+        #         'runPLoM._run_simulation: please check if the dakota is installed correctly on the system.'
+        #     )
+
+        # if not os.path.exists(dakotaTabPath):
+        #     try:
+        #         subprocess.call(command_line)
+        #     except:
+        #         print(
+        #             'runPLoM._run_simulation: error in running dakota to generate the initial sample.'
+        #         )
+        #         print(
+        #             'runPLoM._run_simulation: please check if the dakota is installed correctly on the system.'
+        #         )
+
+        # if not os.path.exists(dakotaTabPath):
+        #     msg = 'Dakota preprocessor did not run successfully'
+        #     self.errlog.exit(msg)
+
+        print(f"Dakota path: {shutil.which('dakota')}")  # noqa: T201
+
+        completed = None
         try:
-            os.system(command_line)  # noqa: S605
-        except:  # noqa: E722
+            completed = subprocess.run(  # noqa: S603
+                command_line,
+                check=True,  # raise if exit code != 0
+                capture_output=True,  # capture BOTH stdout and stderr
+                text=True,  # return str, not bytes
+            )
+        except subprocess.CalledProcessError as e:
             print(  # noqa: T201
                 'runPLoM._run_simulation: error in running dakota to generate the initial sample.'
             )
             print(  # noqa: T201
                 'runPLoM._run_simulation: please check if the dakota is installed correctly on the system.'
             )
-
+            print(f'Command: {" ".join(e.cmd)}')  # noqa: T201
+            print(f'Return code: {e.returncode}')  # noqa: T201
+            if e.stdout:
+                print('---- STDOUT ----')  # noqa: T201
+                print(e.stdout)  # noqa: T201
+            if e.stderr:
+                print('---- STDERR ----')  # noqa: T201
+                print(e.stderr)  # noqa: T201
+            self.errlog.exit('Dakota preprocessor did not run successfully')
+
+        # Verify expected output exists
         if not os.path.exists(dakotaTabPath):  # noqa: PTH110
-            try:
-                subprocess.call(command_line)  # noqa: S603
-            except:  # noqa: E722
-                print(  # noqa: T201
-                    'runPLoM._run_simulation: error in running dakota to generate the initial sample.'
-                )
-                print(  # noqa: T201
-                    'runPLoM._run_simulation: please check if the dakota is installed correctly on the system.'
-                )
-
-        if not os.path.exists(dakotaTabPath):  # noqa: PTH110
-            msg = 'Dakota preprocessor did not run successfully'
-            self.errlog.exit(msg)
+            print(  # noqa: T201
+                'runPLoM._run_simulation: Dakota finished without creating the expected output file.'
+            )
+            print(f'Expected file missing: {dakotaTabPath}')  # noqa: T201
+            if completed is not None:
+                if completed.stdout:
+                    print('---- STDOUT ----')  # noqa: T201
+                    print(completed.stdout)  # noqa: T201
+                if completed.stderr:
+                    print('---- STDERR ----')  # noqa: T201
+                    print(completed.stderr)  # noqa: T201
+            self.errlog.exit('Dakota preprocessor did not run successfully')
 
         # remove the new dakota.json
         # os.remove('sc_dakota_plom.json')

From 774d40c41c983ab8ea14c40ccf11fe0bfc7ec18c Mon Sep 17 00:00:00 2001
From: bsaakash <11618528+bsaakash@users.noreply.github.com>
Date: Mon, 11 Aug 2025 23:12:20 -0700
Subject: [PATCH 2/8] abs - removing quotes in path to python executable on
 windows since using subprocess and a list of arguments with shell=False

---
 modules/performUQ/SimCenterUQ/runPLoM.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py
index 382cba000..0c66b538c 100644
--- a/modules/performUQ/SimCenterUQ/runPLoM.py
+++ b/modules/performUQ/SimCenterUQ/runPLoM.py
@@ -173,7 +173,8 @@ def _run_simulation(self):  # noqa: C901
                 pass
             else:
                 # pythonEXE = os.path.join(localAppDir,'applications','python','python.exe')
-                pythonEXE = '"' + sys.executable + '"'  # noqa: N806
+                # pythonEXE = '"' + sys.executable + '"'
+                pythonEXE = sys.executable  # noqa: N806
         else:
             # for remote run and macOS, let's use system python
             pass
@@ -235,7 +236,7 @@ def _run_simulation(self):  # noqa: C901
         ]
 
         print(  # noqa: T201
-            'Command to run dakota:', ' '.join(command_line)
+            'Command to run:', ' '.join(command_line)
         )  # for debugging
 
         dakotaTabPath = os.path.join(self.work_dir, 'dakotaTab.out')  # noqa: PTH118, N806

From b44e4d2d0c826008eaab11f1ecb6de92893d54c3 Mon Sep 17 00:00:00 2001
From: bsaakash <11618528+bsaakash@users.noreply.github.com>
Date: Mon, 11 Aug 2025 23:17:41 -0700
Subject: [PATCH 3/8] abs - removing unneeded print

---
 modules/performUQ/SimCenterUQ/runPLoM.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py
index 0c66b538c..b1276c207 100644
--- a/modules/performUQ/SimCenterUQ/runPLoM.py
+++ b/modules/performUQ/SimCenterUQ/runPLoM.py
@@ -267,8 +267,6 @@ def _run_simulation(self):  # noqa: C901
         #     msg = 'Dakota preprocessor did not run successfully'
         #     self.errlog.exit(msg)
 
-        print(f"Dakota path: {shutil.which('dakota')}")  # noqa: T201
-
         completed = None
         try:
             completed = subprocess.run(  # noqa: S603

From f5899e93faa54ab339da3ee794d54371838b9b09 Mon Sep 17 00:00:00 2001
From: bsaakash <11618528+bsaakash@users.noreply.github.com>
Date: Wed, 20 Aug 2025 15:05:31 -0700
Subject: [PATCH 4/8] abs - improving plom error log - fixes overwriting dakota
 error messages

---
 modules/performUQ/SimCenterUQ/runPLoM.py | 25 ++++++++++++++++--------
 modules/performUQ/dakota/DakotaUQ.py     |  6 +++---
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py
index b1276c207..693578dbe 100644
--- a/modules/performUQ/SimCenterUQ/runPLoM.py
+++ b/modules/performUQ/SimCenterUQ/runPLoM.py
@@ -46,6 +46,8 @@
 import shutil
 import subprocess
 import sys
+from datetime import datetime, timezone
+from pathlib import Path
 
 import numpy as np
 import pandas as pd
@@ -741,7 +743,7 @@ def _load_hyperparameter(self):
             if (
                 self.constraintsFlag
             ):  # sy - added because quoFEM/EE-UQ example failed 09/10/2024
-                constr_file = Path(self.constraintsFile).resolve()  # noqa: F405
+                constr_file = Path(self.constraintsFile).resolve()
                 sys.path.insert(0, str(constr_file.parent) + '/')
                 constr_script = importlib.__import__(  # noqa: F405
                     constr_file.name[:-3], globals(), locals(), [], 0
@@ -750,7 +752,7 @@ def _load_hyperparameter(self):
                 print('beta_c = ', self.beta_c)  # noqa: T201
                 # if smootherKDE
             if self.smootherKDE_Customize:
-                kde_file = Path(self.smootherKDE_file).resolve()  # noqa: F405
+                kde_file = Path(self.smootherKDE_file).resolve()
                 sys.path.insert(0, str(kde_file.parent) + '/')
                 kde_script = importlib.__import__(  # noqa: F405
                     kde_file.name[:-3], globals(), locals(), [], 0
@@ -761,7 +763,7 @@ def _load_hyperparameter(self):
                 print('epsilon_k = ', self.smootherKDE)  # noqa: T201
             # if tolKDE
             if self.kdeTolerance_Customize:
-                beta_file = Path(self.kdeTolerance_file).resolve()  # noqa: F405
+                beta_file = Path(self.kdeTolerance_file).resolve()
                 sys.path.insert(0, str(beta_file.parent) + '/')
                 beta_script = importlib.__import__(  # noqa: F405
                     beta_file.name[:-3], globals(), locals(), [], 0
@@ -1026,13 +1028,20 @@ def read_txt(text_dir, errlog):  # noqa: D103
 
 class errorLog:  # noqa: D101
     def __init__(self, work_dir):
-        self.file = open(f'{work_dir}/dakota.err', 'w')  # noqa: SIM115, PTH123
+        self.path = Path(work_dir) / 'dakota.err'
+        self.path.parent.mkdir(parents=True, exist_ok=True)  # ensure dir exists
+        self.path.touch(exist_ok=True)  # create if missing
 
     def exit(self, msg):  # noqa: D102
-        print(msg)  # noqa: T201
-        self.file.write(msg)
-        self.file.close()
-        exit(-1)  # noqa: PLR1722
+        print(msg, file=sys.stderr)  # also send to stderr  # noqa: T201
+        try:
+            with self.path.open('a', encoding='utf-8', errors='replace') as f:
+                ts = datetime.now(tz=timezone.utc).isoformat(timespec='seconds')
+                f.write(f'\n\n---- PLoM error @ {ts} ----\n')
+                f.write(msg.rstrip() + '\n')
+        except Exception as e:  # noqa: BLE001
+            print(f'[WARN] Could not write to {self.path}: {e}', file=sys.stderr)  # noqa: T201
+        sys.exit(-1)  # nonzero exit for failure
 
 
 def build_surrogate(work_dir, os_type, run_type, input_file, workflow_driver):
diff --git a/modules/performUQ/dakota/DakotaUQ.py b/modules/performUQ/dakota/DakotaUQ.py
index d73db4419..1753f0b76 100644
--- a/modules/performUQ/dakota/DakotaUQ.py
+++ b/modules/performUQ/dakota/DakotaUQ.py
@@ -142,9 +142,9 @@ def main(args):  # noqa: C901, D103
             returncode = e.returncode  # noqa: F841
             run_success = False
 
-        dakotaErrFile = os.path.join(cwd, 'dakota.err')  # noqa: N806, PTH109, PTH118, RUF100, W291
-        dakotaOutFile = os.path.join(cwd, 'dakota.out')  # noqa: N806, PTH109, PTH118, RUF100
-        dakotaTabFile = os.path.join(cwd, 'dakotaTab.out')  # noqa: N806, PTH109, PTH118, RUF100
+        dakotaErrFile = os.path.join(cwd, 'dakota.err')  # noqa: N806, PTH109, PTH118
+        dakotaOutFile = os.path.join(cwd, 'dakota.out')  # noqa: N806, PTH109, PTH118
+        dakotaTabFile = os.path.join(cwd, 'dakotaTab.out')  # noqa: N806, PTH109, PTH118
         checkErrFile = os.path.exists(dakotaErrFile)  # noqa: PTH110, N806
         checkOutFile = os.path.exists(dakotaOutFile)  # noqa: PTH110, N806
         checkTabFile = os.path.exists(dakotaTabFile)  # noqa: F841, N806, PTH110

From 6675b3db1639b7b428d059e90b3022be121f2941 Mon Sep 17 00:00:00 2001
From: bsaakash <11618528+bsaakash@users.noreply.github.com>
Date: Wed, 20 Aug 2025 16:24:29 -0700
Subject: [PATCH 5/8] abs - fixing slightly misleading error message in plom
 call to dakota

---
 modules/performUQ/SimCenterUQ/runPLoM.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py
index 693578dbe..c5815ac39 100644
--- a/modules/performUQ/SimCenterUQ/runPLoM.py
+++ b/modules/performUQ/SimCenterUQ/runPLoM.py
@@ -292,7 +292,7 @@ def _run_simulation(self):  # noqa: C901
             if e.stderr:
                 print('---- STDERR ----')  # noqa: T201
                 print(e.stderr)  # noqa: T201
-            self.errlog.exit('Dakota preprocessor did not run successfully')
+            self.errlog.exit('Dakota did not run successfully')
 
         # Verify expected output exists
         if not os.path.exists(dakotaTabPath):  # noqa: PTH110
@@ -307,7 +307,7 @@ def _run_simulation(self):  # noqa: C901
                 if completed.stderr:
                     print('---- STDERR ----')  # noqa: T201
                     print(completed.stderr)  # noqa: T201
-            self.errlog.exit('Dakota preprocessor did not run successfully')
+            self.errlog.exit('Dakota did not run successfully')
 
         # remove the new dakota.json
         # os.remove('sc_dakota_plom.json')

From ee95113015de8705c9f0f7da78173be7f6e6d37d Mon Sep 17 00:00:00 2001
From: bsaakash <11618528+bsaakash@users.noreply.github.com>
Date: Wed, 20 Aug 2025 16:33:19 -0700
Subject: [PATCH 6/8] abs - removing mpi-related variables in env for plom
 process which runs dakota when remote

---
 modules/performUQ/SimCenterUQ/runPLoM.py | 32 ++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py
index c5815ac39..74123f857 100644
--- a/modules/performUQ/SimCenterUQ/runPLoM.py
+++ b/modules/performUQ/SimCenterUQ/runPLoM.py
@@ -269,13 +269,45 @@ def _run_simulation(self):  # noqa: C901
         #     msg = 'Dakota preprocessor did not run successfully'
         #     self.errlog.exit(msg)
 
+        # decide if we're on HPC/remote
+        is_remote = job_config.get('runType', 'runningLocal') == 'runningRemote'
+
+        # build a sanitized env ONLY for the child process running Dakota on remote
+        env = os.environ.copy()
+        removed = []
+        if is_remote:
+            # Strip common MPI/PMI launch-context vars so Dakota won't try MPI_Init via PMI
+            prefixes = (
+                'PMI_',
+                'OMPI_',
+                'MPI_',
+                'I_MPI_',
+                'HYDRA_',
+                'SLURM_',
+                'PMIX_',
+                'MPICH_',
+                'UCX_',
+                'FI_',
+                'PALS_',
+            )
+            for k in list(env):
+                if k.startswith(prefixes):
+                    removed.append(k)
+                    env.pop(k, None)
+            if removed:
+                print(  # noqa: T201
+                    f'sanitized env for dakota run (removed {len(removed)} keys): {removed[:6]} ...'
+                )
+
         completed = None
         try:
+            # run DakotaUQ with the chosen env
             completed = subprocess.run(  # noqa: S603
                 command_line,
                 check=True,  # raise if exit code != 0
                 capture_output=True,  # capture BOTH stdout and stderr
                 text=True,  # return str, not bytes
+                env=env if is_remote else None,  # use sanitized env on remote
             )
         except subprocess.CalledProcessError as e:
             print(  # noqa: T201

From 6ea15c490d9679481fe927233546ace31ef0a8cd Mon Sep 17 00:00:00 2001
From: bsaakash <11618528+bsaakash@users.noreply.github.com>
Date: Wed, 20 Aug 2025 17:15:59 -0700
Subject: [PATCH 7/8] abs - post-processing after the dakota simulation when
 running remotely as well

---
 modules/performUQ/SimCenterUQ/runPLoM.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py
index 74123f857..3ff0423df 100644
--- a/modules/performUQ/SimCenterUQ/runPLoM.py
+++ b/modules/performUQ/SimCenterUQ/runPLoM.py
@@ -344,7 +344,7 @@ def _run_simulation(self):  # noqa: C901
         # remove the new dakota.json
         # os.remove('sc_dakota_plom.json')
 
-        if runType in ['run', 'runningLocal']:
+        if runType in ['run', 'runningLocal', 'runningRemote']:
             # create the response.csv file from the dakotaTab.out file
             os.chdir(run_dir)
             if bldg_id is not None:
@@ -367,7 +367,7 @@ def _run_simulation(self):  # noqa: C901
                     )
             self.job_config = job_config
 
-        elif self.run_type in ['set_up', 'runningRemote']:
+        elif self.run_type in ['set_up']:
             pass
 
     def _prepare_training_data(self, run_dir):  # noqa: C901

From 4a5d02288d96ae02db1f72eedcc687d93ba49c0c Mon Sep 17 00:00:00 2001
From: bsaakash <11618528+bsaakash@users.noreply.github.com>
Date: Wed, 20 Aug 2025 17:17:15 -0700
Subject: [PATCH 8/8] abs - not creating err file in init, otherwise dakota
 will not write to it

---
 modules/performUQ/SimCenterUQ/runPLoM.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/performUQ/SimCenterUQ/runPLoM.py b/modules/performUQ/SimCenterUQ/runPLoM.py
index 3ff0423df..c50a92419 100644
--- a/modules/performUQ/SimCenterUQ/runPLoM.py
+++ b/modules/performUQ/SimCenterUQ/runPLoM.py
@@ -1062,7 +1062,6 @@ class errorLog:  # noqa: D101
     def __init__(self, work_dir):
         self.path = Path(work_dir) / 'dakota.err'
         self.path.parent.mkdir(parents=True, exist_ok=True)  # ensure dir exists
-        self.path.touch(exist_ok=True)  # create if missing
 
     def exit(self, msg):  # noqa: D102
         print(msg, file=sys.stderr)  # also send to stderr  # noqa: T201