
Commit 56aa56a

Cleaning up commits
1 parent eb94c1f commit 56aa56a

File tree: 3 files changed (+590 -1 lines changed)


src/access/profiling/__init__.py

Lines changed: 8 additions & 1 deletion
@@ -13,7 +13,14 @@
 
 from access.profiling.parser import ProfilingParser
 from access.profiling.fms_parser import FMSProfilingParser
+from access.profiling.um_parser import UMProfilingParser
 from access.profiling.payujson_parser import PayuJSONProfilingParser
 from access.profiling.cice5_parser import CICE5ProfilingParser
 
-__all__ = ["ProfilingParser", "FMSProfilingParser", "PayuJSONProfilingParser", "CICE5ProfilingParser"]
+__all__ = [
+    "ProfilingParser",
+    "FMSProfilingParser",
+    "UMProfilingParser",
+    "CICE5ProfilingParser",
+    "PayuJSONProfilingParser",
+]
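A quick sketch of what this change enables (not part of the commit): the new parser becomes importable from the package root alongside the existing ones. The printed metrics list is taken from the um_parser.py diff below.

```python
# Sketch: UMProfilingParser is now part of the package's public interface.
from access.profiling import UMProfilingParser

parser = UMProfilingParser()
print(parser.metrics)  # ['tavg', 'tmed', 'tstd', 'tmax', 'pemax', 'tmin', 'pemin']
```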

src/access/profiling/um_parser.py

Lines changed: 193 additions & 0 deletions
@@ -0,0 +1,193 @@
+# Copyright 2025 ACCESS-NRI and contributors. See the top-level COPYRIGHT file for details.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Parser for UM profiling data.
+
+This routine parses the inclusive timers from the UM output log
+(e.g. ``atm.fort6.pe0`` for UM7) and returns a dictionary of the
+profiling data. Since UM7 and UM13 provide multiple sections with timer
+output, we have chosen to use the 'Wallclock times' sub-section
+within the Inclusive Timer Summary section.
+
+The profiling data is assumed to have the following
+format:
+
+```
+MPP : Inclusive timer summary
+
+    WALLCLOCK TIMES
+<N>  ROUTINE                 MEAN   MEDIAN     SD  % of mean      MAX  (PE)      MIN  (PE)
+  1  AS3 Atmos_Phys2      1308.30  1308.30   0.02      0.00%  1308.33 ( 118)  1308.26 ( 221)
+  2  AP2 Boundary Layer    956.50   956.13   3.26      0.34%   981.27 ( 136)   953.28 (  43)
+  3  AS5-8 Updates         884.62   885.52   2.89      0.33%   889.49 (  48)   879.36 ( 212)
+
+...
+
+CPU TIMES (sorted by wallclock times)
+<N>  ROUTINE                 MEAN   MEDIAN     SD  % of mean      MAX  (PE)      MIN  (PE)
+...
+```
+
+All columns in the first sub-section, except for the numeric index and the `% of mean`, are parsed and returned.
+For UM versions 13.x, there is an extra 'N' column name that appears to the left of 'ROUTINE'; this 'N' is
+not present in the output from UM v7.x.
+
+"""
+
+from access.profiling.parser import ProfilingParser, _convert_from_string
+
+import re
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class UMProfilingParser(ProfilingParser):
+    """UM profiling output parser."""
+
+    def __init__(self):
+        """Instantiate the UM profiling parser."""
+        super().__init__()
+
+        # The parsed column names that will be kept. The order needs to match
+        # the order of the column names in the input data (defined as ``raw_headers``
+        # in the ``read`` method), after discarding the ignored columns.
+        self._metrics = ["tavg", "tmed", "tstd", "tmax", "pemax", "tmin", "pemin"]
+
+    @property
+    def metrics(self) -> list:
+        """Return a list of metrics (i.e., columns in the profiling data)."""
+        return self._metrics
+
+    def read(self, stream: str) -> dict:
+        """Parse UM profiling data from a string.
+
+        Args:
+            stream (str): input string to parse.
+
+        Returns:
+            stats (dict): dictionary of parsed profiling data. The ``N`` and
+                ``% of mean`` columns are ignored.
+
+                To keep consistent column names across all parsers, the following
+                mapping is used:
+
+                ================== ====================
+                UM column name     Standard column name
+                ================== ====================
+                N                  - (ignored)
+                ROUTINE            region
+                MEAN               tavg
+                MEDIAN             tmed
+                SD                 tstd
+                % of mean          - (ignored)
+                MAX                tmax
+                (PE)               pemax
+                MIN                tmin
+                (PE)               pemin
+                ================== ====================
+
+                Each key returns a list of values, one for each region. For
+                example, if there are 20 regions, ``stats['tavg']`` will
+                return a list with 20 values, one for each of the regions.
+
+                The assumption is that people will want to look at the same metric
+                for *all* regions at a time; if you want to look at all metrics for
+                a single region, then you will have to first find the index for the
+                ``region``, and then extract that index from *each* of the 'metric'
+                lists.
+
+                Any number of column headers can be present at the beginning, i.e., before
+                ``ROUTINE`` (including ``N`` for UM v13+), and they will be ignored when the
+                header is being parsed. Such columns must contain integer values for the
+                profiling information parsing to work; otherwise (e.g., new columns with float
+                data), an ``AssertionError`` is raised.
+
+                Parsing will fail if there are any unexpected columns at the end, either
+                at the end of the expected header, or at the end of the expected columns with
+                profiling data (regardless of the type of the column value).
+
+        Raises:
+            ValueError: If a match for any of header, footer or section (i.e., empty section)
+                is not found.
+            AssertionError: If the expected format is not found in *all* of the lines within the
+                profiling section.
+        """
+
+        # First create the local variable with the metrics list
+        metrics = self.metrics
+        raw_headers = ["ROUTINE", "MEAN", "MEDIAN", "SD", r"\% of mean", "MAX", r"\(PE\)", "MIN", r"\(PE\)"]
+
+        header = r"MPP : Inclusive timer summary\s+WALLCLOCK TIMES\s*"
+        # UM 13 has an extra header 'N' for the numeric row index (that UM7 does not).
+        # Writing the pattern this way avoids having to code in UM version dependent patterns.
+        header += r"\S*\s+"
+        # Then skip over white-space-separated header names.
+        header += r"\s*".join(raw_headers) + r"\s*"
+        header_pattern = re.compile(header, re.MULTILINE)
+        header_match = header_pattern.search(stream)
+        if not header_match:
+            logger.debug("Header pattern: %s", header)
+            logger.debug("Input string: %s", stream)
+            raise ValueError("No matching header found.")
+        logger.debug("Found header: %s", header_match.group(0))
+
+        # This line (and any preceding whitespace) indicates
+        # the end of the profiling data that we want to parse
+        footer = r"CPU TIMES \(sorted by wallclock times\)\s*"
+        footer_pattern = re.compile(footer, re.MULTILINE)
+        footer_match = footer_pattern.search(stream)
+        if not footer_match:
+            logger.debug("Footer pattern: %s", footer)
+            logger.debug("Input string: %s", stream)
+            raise ValueError("No matching footer found.")
+        logger.debug("Found footer: %s", footer_match.group(0))
+
+        # Match *everything* between the header and footer (the match could be 0 characters)
+        profiling_section_p = re.compile(header + r"(.*)" + footer, re.MULTILINE | re.DOTALL)
+        profiling_section = profiling_section_p.search(stream)
+        if not profiling_section:
+            raise ValueError("No profiling section found between header and footer.")
+        profiling_section = profiling_section.group(1)
+        logger.debug("Found section: %s", profiling_section)
+
+        # This is regex dark arts - seems to work, I roughly understood when I
+        # was refining this named capture group, but I might not be able to in
+        # the future. Made heavy use of the regex debugger at regex101.com :) - MS 19/9/2025
+        profile_line = r"^\s*[\d\s]+\s+(?P<region>[a-zA-Z][a-zA-Z:()_/\-*&0-9\s\.]+(?<!\s))"
+        for metric in metrics:
+            logger.debug(f"Adding {metric=}")
+            if metric in ["pemax", "pemin"]:
+                # the pemax and pemin values are enclosed within brackets '()',
+                # so we need to ignore both the opening and closing brackets
+                add_pattern = r"\s+\(\s*(?P<" + metric + r">[0-9.]+)\s*\)"
+            elif metric == "tstd":
+                add_pattern = (
+                    r"\s+(?P<" + metric + r">[0-9.]+)\s+[\S]+"
+                )  # SD is followed by % of mean -> ignore that column
+            else:
+                add_pattern = (
+                    r"\s+(?P<" + metric + r">[0-9.]+)"
+                )  # standard white-space followed by a sequence of digits or '.'
+
+            logger.debug(f"{add_pattern=} for {metric=}")
+            profile_line += add_pattern
+            logger.debug(f"{profile_line=} after {metric=}")
+
+        profile_line += r"$"  # the regex should match till the end of line.
+        profiling_region_p = re.compile(profile_line, re.MULTILINE)
+
+        stats = {"region": []}
+        stats.update(dict(zip(metrics, [[] for _ in metrics])))
+        for line in profiling_region_p.finditer(profiling_section):
+            logger.debug(f"Matched line: {line.group(0)}")
+            stats["region"].append(line.group("region"))
+            for metric in metrics:
+                stats[str(metric)].append(_convert_from_string(line.group(metric)))
+
+        # Parsing is done - let's run some checks
+        num_lines = len(profiling_section.strip().split("\n"))
+        logger.debug(f"Found {num_lines} lines in profiling section")
+        if len(stats["region"]) != num_lines:
+            raise AssertionError(f"Expected {num_lines} regions, found {len(stats['region'])}.")
+
+        logger.info(f"Found {len(stats['region'])} regions with profiling info")
+        return stats
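For reference, a minimal usage sketch (not part of the commit): it runs ``read`` on a UM13-style snippet adapted from the docstring example above. The ``read`` API and column names come from the diff; the exact numeric types in the result depend on ``_convert_from_string``.

```python
from access.profiling import UMProfilingParser

# A UM13-style snippet: 'N' index column, two timed regions,
# terminated by the 'CPU TIMES' footer that ends the parsed section.
sample = """
MPP : Inclusive timer summary

    WALLCLOCK TIMES
 N  ROUTINE                MEAN   MEDIAN     SD  % of mean      MAX  (PE)      MIN  (PE)
  1 AS3 Atmos_Phys2     1308.30  1308.30   0.02      0.00%  1308.33 ( 118)  1308.26 ( 221)
  2 AP2 Boundary Layer   956.50   956.13   3.26      0.34%   981.27 ( 136)   953.28 (  43)

 CPU TIMES (sorted by wallclock times)
"""

stats = UMProfilingParser().read(sample)
print(stats["region"])  # ['AS3 Atmos_Phys2', 'AP2 Boundary Layer']
print(stats["tavg"])    # [1308.3, 956.5] (types depend on _convert_from_string)
```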
