Skip to content

Commit 460f7c2

Browse files
authored
Use proper DCGM metric for uncorrectable errors (#341)
1 parent b9d1053 commit 460f7c2

File tree

2 files changed

+2
-2
lines changed

2 files changed

+2
-2
lines changed

cmd/do-agent/aggregation.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -125,7 +125,7 @@ var gpuAggregationSpec = map[string][]string{
125 125   "dcgm_fi_dev_power_usage": nvidiaAggregatedLabels,
126 126   "dcgm_fi_dev_power_violation": nvidiaAggregatedLabels,
127 127   "dcgm_fi_dev_thermal_violation": nvidiaAggregatedLabels,
128     - "dcgm_fi_dev_ecc_sbe_agg_total": nvidiaAggregatedLabels,
    128 + "dcgm_fi_dev_ecc_dbe_agg_total": nvidiaAggregatedLabels,
129 129
130 130   // GPU Utilization metrics
131 131   "amd_gpu_prof_gui_util_percent": amdAggregatedLabels,

cmd/do-agent/whitelist.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -179,7 +179,7 @@ var gpuWhitelist = map[string]bool{
179 179   "DCGM_FI_DEV_POWER_USAGE": true,
180 180   "DCGM_FI_DEV_POWER_VIOLATION": true,
181 181   "DCGM_FI_DEV_THERMAL_VIOLATION": true,
182     - "DCGM_FI_DEV_ECC_SBE_AGG_TOTAL": true,
    182 + "DCGM_FI_DEV_ECC_DBE_AGG_TOTAL": true,
183 183
184 184   // GPU Utilization
185 185   "amd_gpu_prof_gui_util_percent": true,

0 commit comments

Comments
 (0)