Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# PrometheusRule that defines the "RiskApplies" alert in the
# cluster-version-tech-preview rule group.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
k8s-app: cluster-version-operator
name: cluster-version-operator-tech-preview
namespace: openshift-cluster-version
annotations:
kubernetes.io/description: Alerting rules for when cluster-version operator metrics call for administrator attention.
exclude.release.openshift.io/internal-openshift-hosted: "true"
include.release.openshift.io/self-managed-high-availability: "true"
# NOTE(review): presumably limits this manifest to clusters running the
# TechPreviewNoUpgrade feature set; confirm against the release-payload
# annotation conventions.
release.openshift.io/feature-set: TechPreviewNoUpgrade
spec:
groups:
- name: cluster-version-tech-preview
rules:
# Warns when a conditional update risk has been reported as applying for
# 10 minutes (see "expr" and "for" below).
- alert: RiskApplies
annotations:
summary: The cluster has been exposed to the conditional update risk for 10 minutes.
# The nested braces render to a literal "{{ $labels.risk }}" placeholder.
# NOTE(review): this looks like escaping for an outer Go-template pass over
# the manifest before Prometheus expands the label; confirm.
description: The conditional update risk {{ "{{ $labels.risk }}" }} applies to the cluster, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/RiskApplies.md
# Keeps cluster_version_risk_conditions series from the
# cluster-version-operator job with name="version" and condition="Applies"
# whose value is 1, collapsed to one series per (namespace, name, risk).
expr: |
max by (namespace, name, risk) (cluster_version_risk_conditions{job="cluster-version-operator", name="version", condition="Applies"} == 1)
# The expression must keep matching for 10 minutes before the alert fires.
for: 10m
labels:
severity: warning
27 changes: 27 additions & 0 deletions pkg/cvo/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ type operatorMetrics struct {
capability *prometheus.GaugeVec
clusterOperatorUp *prometheus.GaugeVec
clusterOperatorConditions *prometheus.GaugeVec
clusterVersionRiskConditions *prometheus.GaugeVec
clusterOperatorConditionTransitions *prometheus.GaugeVec
clusterInstaller *prometheus.GaugeVec
clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec
Expand Down Expand Up @@ -108,6 +109,10 @@ penultimate completed version for 'completed'.
Name: "cluster_operator_conditions",
Help: "Report the conditions for active cluster operators. 0 is False and 1 is True.",
}, []string{"name", "condition", "reason"}),
clusterVersionRiskConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "cluster_version_risk_conditions",
Help: "Report the risk conditions for cluster versions. 0 is False and 1 is True.",
}, []string{"name", "condition", "risk"}),
clusterOperatorConditionTransitions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "cluster_operator_condition_transitions",
Help: "Reports the number of times that a condition on a cluster operator changes status",
Expand Down Expand Up @@ -436,6 +441,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
ch <- m.capability.WithLabelValues("").Desc()
ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc()
ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
ch <- m.clusterVersionRiskConditions.WithLabelValues("", "", "").Desc()
ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc()
Expand All @@ -457,6 +463,24 @@ func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric,
}
}

// collectConditionalUpdateRisks sends one cluster_version_risk_conditions
// sample to ch for every Applies condition found on the given risks.
//
// Conditions of any other type are ignored. Each sample is labelled with the
// fixed name "version", the condition type, and the risk name; its value is 1
// when the condition status is True and 0 for every other status.
func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Metric, risks []configv1.ConditionalUpdateRisk) {
	for i := range risks {
		r := &risks[i]
		for j := range r.Conditions {
			c := &r.Conditions[j]
			if c.Type == internal.ConditionalUpdateRiskConditionTypeApplies {
				// Anything other than an explicit True is reported as 0.
				var value float64
				if c.Status == metav1.ConditionTrue {
					value = 1
				}

				gauge := m.clusterVersionRiskConditions.WithLabelValues("version", c.Type, r.Name)
				gauge.Set(value)
				ch <- gauge
			}
		}
	}
}

// Collect collects metrics from the operator into the channel ch
func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
current := m.optr.currentVersion()
Expand Down Expand Up @@ -602,6 +626,9 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
}

m.collectConditionalUpdates(ch, cv.Status.ConditionalUpdates)
if m.optr.shouldReconcileAcceptRisks() {
m.collectConditionalUpdateRisks(ch, cv.Status.ConditionalUpdateRisks)
}
}

g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version)
Expand Down
112 changes: 112 additions & 0 deletions pkg/cvo/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"errors"
"fmt"
"github.com/openshift/cluster-version-operator/pkg/featuregates"
"io"
"net/http"
"net/http/httptest"
Expand Down Expand Up @@ -667,6 +668,7 @@ func Test_operatorMetrics_Collect(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
tt.optr.enabledCVOFeatureGates = featuregates.DefaultCvoGates("version")
tt.optr.eventRecorder = record.NewFakeRecorder(100)
if tt.optr.cvLister == nil {
tt.optr.cvLister = &cvLister{}
Expand Down Expand Up @@ -973,6 +975,116 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) {
}
}

func Test_collectConditionalUpdateRisks(t *testing.T) {
type valueWithLabels struct {
value float64
labels map[string]string
}
testCases := []struct {
name string
risks []configv1.ConditionalUpdateRisk
expected []valueWithLabels
}{
{
name: "no conditional updates",
expected: []valueWithLabels{},
},
{
name: "unknown type",
risks: []configv1.ConditionalUpdateRisk{
{
Name: "RiskX",
Conditions: []metav1.Condition{{
Type: internal.ConditionalUpdateConditionTypeRecommended,
Status: metav1.ConditionFalse,
Reason: "ReasonA",
Message: "Risk does not apply",
}},
},
},
},
{
name: "apply false",
risks: []configv1.ConditionalUpdateRisk{
{
Name: "RiskX",
Conditions: []metav1.Condition{{
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
Status: metav1.ConditionFalse,
Reason: "ReasonA",
Message: "Risk does not apply",
}},
},
},
expected: []valueWithLabels{{
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
}},
},
{
name: "apply true",
risks: []configv1.ConditionalUpdateRisk{
{
Name: "RiskX",
Conditions: []metav1.Condition{{
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
Status: metav1.ConditionTrue,
Reason: "ReasonA",
Message: "Risk does not apply",
}},
},
},
expected: []valueWithLabels{{
value: 1,
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
}},
},
{
name: "apply unknown",
risks: []configv1.ConditionalUpdateRisk{
{
Name: "RiskX",
Conditions: []metav1.Condition{{
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
Status: metav1.ConditionUnknown,
Reason: "ReasonA",
Message: "Risk does not apply",
}},
},
},
expected: []valueWithLabels{{
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
}},
},
}

for _, tc := range testCases {
tc := tc
t.Run(tc.name, func(t *testing.T) {
optr := &Operator{}
m := newOperatorMetrics(optr)
ch := make(chan prometheus.Metric)

go func() {
m.collectConditionalUpdateRisks(ch, tc.risks)
close(ch)
}()

var collected []prometheus.Metric
for item := range ch {
collected = append(collected, item)
}

if lenC, lenE := len(collected), len(tc.expected); lenC != lenE {

t.Fatalf("Expected %d metrics, got %d metrics\nGot metrics: %s", lenE, lenC, spew.Sdump(collected))
}
for i := range tc.expected {
expectMetric(t, collected[i], tc.expected[i].value, tc.expected[i].labels)
}
})
}
}

func expectMetric(t *testing.T, metric prometheus.Metric, value float64, labels map[string]string) {
t.Helper()
var d dto.Metric
Expand Down