Skip to content

Commit cafb300

Browse files
Workflows Base Module (#229)
* close #237 * add workflows.base (code for reading CSV file of projects and running automated workflows on each project) * add workflows.registry (with a single example workflow, DihedralAnalysis) * add new testing data, .csv for workflows base module; add workflows to STATES dictionary * add documentation for workflows registry and base module * update CHANGES
1 parent b7d0b06 commit cafb300

File tree

9 files changed

+361
-4
lines changed

9 files changed

+361
-4
lines changed

CHANGES

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,13 @@ Changes
2323

2424
Enhancements
2525

26-
* new workflows module (PR #217)
26+
* new workflows registry that contains each EnsembleAnalysis for which
27+
a workflows module exists, for use with workflows base module (#229)
28+
* new workflows base module that provides iterative workflow use for
29+
directories that contain multiple projects (#229)
30+
* new workflows module (#217)
2731
* new automated dihedral analysis workflow (detect dihedrals with SMARTS,
28-
analyze with EnsembleAnalysis, and generate seaborn violinplots)
29-
PR #217)
32+
analyze with EnsembleAnalysis, and generate seaborn violinplots) (#217)
3033

3134
Fixes
3235

@@ -36,7 +39,7 @@ Fixes
3639
* fix ensemble.EnsembleAnalysis.check_groups_from_common_ensemble (#212)
3740

3841

39-
2021-01-03 0.8.0
42+
2022-01-03 0.8.0
4043
ALescoulie, orbeckst
4144

4245
Changes

doc/sphinx/source/workflows.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,6 @@ for use with :class:`~mdpow.analysis.dihedral.DihedralAnalysis`.
1414
.. toctree::
1515
:maxdepth: 1
1616

17+
workflows/base
18+
workflows/registry
1719
workflows/dihedrals
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
==============
2+
Workflows Base
3+
==============
4+
5+
.. versionadded:: 0.9.0
6+
7+
.. automodule:: mdpow.workflows.base
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
==================
2+
Workflows Registry
3+
==================
4+
5+
.. versionadded:: 0.9.0
6+
7+
.. automodule:: mdpow.workflows.registry

mdpow/tests/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,6 @@
1313
"FEP": RESOURCES.join("states", "FEP"),
1414
"base": RESOURCES.join("states", "base"),
1515
"md_npt": RESOURCES.join("states", "FEP"),
16+
"workflows": RESOURCES.join("states", "workflows"),
1617
}
1718
CONFIGURATIONS = RESOURCES.join("test_configurations")

mdpow/tests/test_workflows_base.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import re
2+
import os
3+
import sys
4+
import yaml
5+
import pybol
6+
import pytest
7+
import pathlib
8+
import logging
9+
10+
import pandas as pd
11+
12+
from . import RESOURCES
13+
from . import STATES
14+
15+
import py.path
16+
17+
from ..workflows import base
18+
19+
from pkg_resources import resource_filename
20+
21+
# Location of the packaged test data and of the manifest file inside it.
# NOTE: this rebinds the RESOURCES name imported from the test package above.
RESOURCES = pathlib.PurePath(
    resource_filename(__name__, 'testing_resources')
)
MANIFEST = RESOURCES.joinpath('manifest.yml')
23+
24+
@pytest.fixture(scope='function')
def molname_workflows_directory(tmp_path):
    """Assemble the 'workflows' data set from the manifest into *tmp_path*.

    Returns the temporary directory so that tests can use it as the
    top-level directory holding the assembled files.
    """
    manifest = pybol.Manifest(str(MANIFEST))
    manifest.assemble('workflows', tmp_path)
    return tmp_path
29+
30+
class TestWorkflowsBase(object):
    """Tests for :func:`base.project_paths` and
    :func:`base.automated_project_analysis`.
    """

    @pytest.fixture(scope='function')
    def SM_tmp_dir(self, molname_workflows_directory):
        """Temporary top-level directory holding the assembled test projects."""
        return molname_workflows_directory

    @pytest.fixture(scope='function')
    def csv_input_data(self):
        """Path of the reference .csv file and its content as a DataFrame."""
        csv_path = STATES['workflows'] / 'project_paths.csv'
        csv_df = pd.read_csv(csv_path).reset_index(drop=True)
        return csv_path, csv_df

    @pytest.fixture(scope='function')
    def test_df_data(self):
        """Expected molecule/resname columns for the two test projects."""
        test_dict = {'molecule': ['SM25', 'SM26'],
                     'resname': ['SM25', 'SM26']}
        return pd.DataFrame(test_dict).reset_index(drop=True)

    @pytest.fixture(scope='function')
    def project_paths_data(self, SM_tmp_dir):
        """DataFrame produced by scanning the temporary project directory."""
        # Save the generated project_paths.csv inside the temporary directory
        # so that running the tests does not leave a file in the current
        # working directory.
        return base.project_paths(parent_directory=SM_tmp_dir,
                                  csv_save_dir=SM_tmp_dir)

    def test_project_paths(self, test_df_data, project_paths_data):
        test_df = test_df_data
        project_paths = project_paths_data

        # Compare as sorted lists: the order in which the directory scan
        # discovers projects depends on the file system, so a positional
        # comparison of individual rows is not reliable.
        assert sorted(project_paths['molecule']) == sorted(test_df['molecule'])
        assert sorted(project_paths['resname']) == sorted(test_df['resname'])

    def test_project_paths_csv_input(self, csv_input_data):
        csv_path, csv_df = csv_input_data
        project_paths = base.project_paths(csv=csv_path)

        pd.testing.assert_frame_equal(project_paths, csv_df)

    def test_automated_project_analysis(self, project_paths_data, caplog):
        # The completion message is logged at INFO level; request that level
        # explicitly so the check does not depend on the logger's
        # configuration at the time the test runs.
        caplog.set_level(logging.INFO, logger='mdpow.workflows.base')

        project_paths = project_paths_data
        # change resname to match topology (every SAMPL7 resname is 'UNK')
        # only necessary for this dataset, not necessary for normal use
        project_paths['resname'] = 'UNK'

        base.automated_project_analysis(project_paths, solvents=('water',),
                                        ensemble_analysis='DihedralAnalysis')

        assert 'all analyses completed' in caplog.text, ('automated_dihedral_analysis '
               'did not iteratively run to completion for the provided project')

    def test_automated_project_analysis_KeyError(self, project_paths_data, caplog):
        caplog.clear()
        caplog.set_level(logging.ERROR, logger='mdpow.workflows.base')

        project_paths = project_paths_data
        # change resname to match topology (every SAMPL7 resname is 'UNK')
        # only necessary for this dataset, not necessary for normal use
        project_paths['resname'] = 'UNK'

        # test error output when raised
        with pytest.raises(KeyError,
                           match="Invalid ensemble_analysis 'DarthVaderAnalysis'. "
                           "An EnsembleAnalysis type that corresponds to an existing "
                           "automated workflow module must be input as a kwarg. ex: "
                           "ensemble_analysis='DihedralAnalysis'"):
            base.automated_project_analysis(project_paths,
                                            ensemble_analysis='DarthVaderAnalysis',
                                            solvents=('water',))

        # test logger error recording
        assert "'DarthVaderAnalysis' is an invalid selection" in caplog.text, ('did not catch incorrect '
               'key specification for workflows.registry that results in KeyError')
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
molecule,resname,path
2+
SM25,SM25,mdpow/tests/testing_resources/states/workflows/SM25
3+
SM26,SM26,mdpow/tests/testing_resources/states/workflows/SM26

mdpow/workflows/base.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
# MDPOW: base.py
2+
# 2022 Cade Duckworth
3+
4+
"""
5+
:mod:`mdpow.workflows.base` --- Automated workflow base functions
6+
=================================================================
7+
8+
To analyze multiple MDPOW projects, provide :func:`project_paths`
9+
with the top-level directory containing all MDPOW projects' simulation data
10+
to obtain a :class:`pandas.DataFrame` containing the project information
11+
and paths. Then, :func:`automated_project_analysis` takes as input the
12+
aforementioned :class:`pandas.DataFrame` and runs the specified
13+
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis` for all MDPOW projects
14+
under the top-level directory provided to :func:`project_paths`.
15+
16+
.. seealso:: :mod:`~mdpow.workflows.registry`
17+
18+
.. autofunction:: project_paths
19+
.. autofunction:: automated_project_analysis
20+
21+
"""
22+
23+
import os
24+
import re
25+
import pandas as pd
26+
27+
from mdpow.workflows import registry
28+
29+
import logging
30+
31+
logger = logging.getLogger('mdpow.workflows.base')
32+
33+
def project_paths(parent_directory=None, csv=None, csv_save_dir=None):
    """Collect the molecule name, resname, and path of each MDPOW project.

    The project metadata is either discovered by walking a top directory
    that contains the MDPOW projects (*parent_directory*) or read from a
    .csv file (*csv*). If both are given, *parent_directory* takes
    precedence and *csv* is ignored.

    :keywords:

    *parent_directory*
        the path for the location of the top directory
        under which the subdirectories of MDPOW simulation
        data exist; every directory that contains a subdirectory
        whose name starts with ``FEP`` is recorded as a project and
        its directory name is used for both `molecule` and `resname`.
        Additionally creates a 'project_paths.csv' file
        for user manipulation of metadata and for future reference

    *csv*
        .csv file containing the molecule names, resnames,
        and paths for the MDPOW simulation data to be iterated
        over; must contain a header of the
        form: `molecule,resname,path`

    *csv_save_dir*
        optionally provided directory to save the .csv file when
        *parent_directory* is used, otherwise,
        data will be saved in current working directory

    :returns:

    *project_paths*
        :class:`pandas.DataFrame` containing MDPOW project metadata
        in the columns `molecule`, `resname`, and `path`, with the
        rows sorted by these columns

    :raises:

    *ValueError*
        if neither *parent_directory* nor *csv* is provided

    .. rubric:: Example

    Typical Workflow::

        project_paths = project_paths(parent_directory='/foo/bar/MDPOW_projects')
        automated_project_analysis(project_paths)

    or::

        project_paths = project_paths(csv='/foo/bar/MDPOW.csv')
        automated_project_analysis(project_paths)

    """
    if parent_directory is None and csv is None:
        # Fail with a clear message instead of an UnboundLocalError at return.
        raise ValueError("Either 'parent_directory' or 'csv' must be provided.")

    sort_columns = ['molecule', 'resname', 'path']

    if parent_directory is None:
        return pd.read_csv(csv).sort_values(by=sort_columns).reset_index(drop=True)

    # A project directory is any directory that directly contains a
    # subdirectory whose name starts with 'FEP'.
    locations = [
        dirpath
        for dirpath, dirnames, _ in os.walk(parent_directory)
        if any(dirname.startswith('FEP') for dirname in dirnames)
    ]

    # normpath removes a trailing separator so that basename is never empty;
    # os.path also keeps the name extraction independent of the platform's
    # path separator.
    names = [os.path.basename(os.path.normpath(loc)) for loc in locations]

    # os.walk yields directories in arbitrary order; sort so that the result
    # is deterministic and consistent with the data read from a .csv file.
    projects = pd.DataFrame(
        {
            'molecule': names,
            'resname': names,
            'path': locations,
        }
    ).sort_values(by=sort_columns).reset_index(drop=True)

    save_dir = os.getcwd() if csv_save_dir is None else csv_save_dir
    projects.to_csv(os.path.join(save_dir, 'project_paths.csv'), index=False)
    logger.info(f'project_paths saved under {save_dir}')

    return projects
113+
114+
def automated_project_analysis(project_paths, ensemble_analysis, **kwargs):
    """Takes a :class:`pandas.DataFrame` created by :func:`~mdpow.workflows.base.project_paths`
    and iteratively runs the specified :class:`~mdpow.analysis.ensemble.EnsembleAnalysis`
    for each of the projects by running the associated automated workflow
    in each project directory returned by :func:`~mdpow.workflows.base.project_paths`.

    Support for additional automated analyses is in development.

    :keywords:

    *project_paths*
        :class:`pandas.DataFrame` that provides paths to MDPOW projects
        in the columns `molecule`, `resname`, and `path`

    *ensemble_analysis*
        name of the :class:`~mdpow.analysis.ensemble.EnsembleAnalysis`
        that corresponds to the desired automated workflow module

    *kwargs*
        keyword arguments for the supported automated workflows,
        see the :mod:`~mdpow.workflows.registry` for all available
        workflows and their call signatures

    :raises:

    *KeyError*
        if *ensemble_analysis* is not a key of the registry

    *TypeError*
        if the registry entry for *ensemble_analysis* cannot be called,
        i.e., the workflow does not exist yet

    Exceptions raised by the workflow itself are not intercepted and
    propagate unchanged.

    .. rubric:: Example

    A typical workflow is the automated dihedral analysis from
    :mod:`mdpow.workflows.dihedrals`, which applies the *ensemble analysis*
    :class:`~mdpow.analysis.dihedral.DihedralAnalysis` to each project.
    The :data:`~mdpow.workflows.registry.registry` contains this automated
    workflow under the key *"DihedralAnalysis"* and so the automated execution
    for all `project_paths` (obtained via :func:`project_paths`) is performed by
    passing the specific key to :func:`automated_project_analysis`::

        project_paths = project_paths(parent_directory='/foo/bar/MDPOW_projects')
        automated_project_analysis(project_paths, ensemble_analysis='DihedralAnalysis', **kwargs)

    """
    # Resolve the workflow once, before any project is processed. Keeping the
    # lookup separate from the workflow call ensures that a KeyError or
    # TypeError raised *inside* a workflow is not misreported as an invalid
    # ensemble_analysis selection.
    try:
        workflow = registry.registry[ensemble_analysis]
    except KeyError as err:
        msg = (f"Invalid ensemble_analysis {err}. An EnsembleAnalysis type that corresponds "
               "to an existing automated workflow module must be input as a kwarg. "
               "ex: ensemble_analysis='DihedralAnalysis'")
        logger.error(f'{err} is an invalid selection')
        raise KeyError(msg) from err
    except TypeError:
        # An unhashable selection can never be a registry key; report it in
        # the same way as a registry entry without a workflow.
        workflow = None

    if not callable(workflow):
        msg = (f"Invalid ensemble_analysis {ensemble_analysis}. An EnsembleAnalysis type that "
               "corresponds to an existing automated workflow module must be input as a kwarg. "
               "ex: ensemble_analysis='DihedralAnalysis'")
        logger.error(f'workflow module for {ensemble_analysis} does not exist yet')
        raise TypeError(msg)

    for row in project_paths.itertuples():
        molname = row.molecule

        logger.info(f'starting {molname}')
        workflow(dirname=row.path, resname=row.resname, molname=molname, **kwargs)
        logger.info(f'{molname} completed')

    logger.info('all analyses completed')

mdpow/workflows/registry.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# MDPOW: registry.py
2+
# 2023 Cade Duckworth
3+
4+
"""
5+
:mod:`mdpow.workflows.registry` --- Registry of currently supported automated workflows
6+
=======================================================================================
7+
8+
The :mod:`mdpow.workflows.registry` module hosts a dictionary with keys that correspond to an
9+
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis` for which a corresponding automated workflow exists.
10+
11+
.. table:: Currently supported automated workflows.
12+
:widths: auto
13+
:name: workflows_registry
14+
15+
+-------------------------------+------------------------------------------------------------------------------------------------------+
16+
| key/keyword: EnsembleAnalysis | value: <workflow module>.<top-level automated analysis function> |
17+
+===============================+======================================================================================================+
18+
| DihedralAnalysis | :any:`dihedrals.automated_dihedral_analysis <mdpow.workflows.dihedrals.automated_dihedral_analysis>` |
19+
+-------------------------------+------------------------------------------------------------------------------------------------------+
20+
21+
.. autodata:: registry
22+
23+
.. seealso:: :mod:`~mdpow.workflows.base`
24+
25+
"""
26+
27+
# import analysis
28+
from mdpow.workflows import dihedrals
29+
30+
# Maps the name of an EnsembleAnalysis (key) to the top-level function of
# the automated workflow that runs it (value).
registry = {'DihedralAnalysis': dihedrals.automated_dihedral_analysis}
35+
36+
"""
37+
In the `registry`, each entry corresponds to an
38+
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis`
39+
for which a corresponding automated workflow exists.
40+
41+
Intended for use with :mod:`mdpow.workflows.base` to specify which
42+
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis` should run iteratively over
43+
the provided project data directory.
44+
45+
To include a new automated workflow for use with :mod:`mdpow.workflows.base`,
46+
create a key that is the name of the corresponding
47+
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis`, with the value defined as
48+
`<workflow module>.<top-level automated analysis function>`.
49+
50+
The available automated workflows (key-value pairs) are listed in the
51+
following table :any:`Currently supported automated workflows. <workflows_registry>`
52+
53+
"""

0 commit comments

Comments
 (0)