diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0966e6e..b6a1cbe 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,8 +37,8 @@ jobs: exit 1 fi - - name: Run tests - run: go test -v -race -coverprofile=coverage.out ./... + - name: Run unit tests + run: go test -v -race -coverprofile=coverage.out ./internal/... - name: Upload coverage uses: codecov/codecov-action@v4 @@ -46,6 +46,23 @@ jobs: files: coverage.out fail_ci_if_error: false + e2e: + runs-on: ubuntu-latest + needs: build + steps: + - uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.21' + + - name: Build binary + run: go build -o bin/metrics-analyzer ./cmd/metrics-analyzer + + - name: Run E2E tests + run: go test -v ./cmd/... + validate-rules: runs-on: ubuntu-latest needs: build diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..3566f5e --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,34 @@ +# Changelog + +All notable changes to this project are documented in this file. + +## 0.0.5 + +- Split histogram `+Inf` overflow reporting by label-set so problematic series are listed individually. +- Added metric description (Prometheus HELP text) to overflow findings. +- Improved overflow wording and added on-the-fly unit guessing for histogram outputs. +- Improved release/docs workflow (release target, GitHub Pages docs, container registry update). + +## 0.0.4 + +- Added containerization and Helm chart support for deployment. +- Fixed Helm chart packaging/versioning issues for `0.0.4`. +- Continued rule quality improvements and review metadata propagation. + +## 0.0.3 + +- Introduced rule review metadata and broader rule/test polishing. +- Added/expanded tests for gauge-threshold rules. +- Improved readability of numeric output formatting in reports. + +## 0.0.2 + +- Added web server mode for serving analyzer output. + +## 0.0.1 + +- Initial usable release with the results of the AI hackathon, including: + - interactive TUI mode, + - demo assets, + - generic histogram rule support, + - first web-server iteration. diff --git a/VERSION b/VERSION index 81340c7..bbdeab6 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.0.4 +0.0.5 diff --git a/cmd/metrics-analyzer/main_test.go b/cmd/metrics-analyzer/main_test.go index e611684..deb81f9 100644 --- a/cmd/metrics-analyzer/main_test.go +++ b/cmd/metrics-analyzer/main_test.go @@ -28,6 +28,7 @@ func TestE2EAnalyzeCommand(t *testing.T) { "analyze", "--rules", "../../testdata/fixtures", "--format", "markdown", + "--template", "../../templates/markdown.tmpl", "--output", "/tmp/test_e2e_report.md", "../../testdata/fixtures/sample_metrics.txt", }, @@ -209,9 +210,12 @@ func TestE2EFullWorkflow(t *testing.T) { metricsPath := filepath.Join("..", "..", "testdata", "fixtures", "sample_metrics.txt") rulesPath := filepath.Join("..", "..", "testdata", "fixtures") + templatePath := filepath.Join("..", "..", "templates", "markdown.tmpl") + cmd := exec.Command(absPath, "analyze", "--rules", rulesPath, "--format", "markdown", + "--template", templatePath, "--output", "/tmp/e2e_test_report.md", metricsPath, ) diff --git a/internal/evaluator/evaluator_common.go b/internal/evaluator/evaluator_common.go index 2147c77..af5e293 100644 --- a/internal/evaluator/evaluator_common.go +++ b/internal/evaluator/evaluator_common.go @@ -155,7 +155,7 @@ func EvaluateAllRules(rulesList []rules.Rule, metrics parser.MetricsData, loadLe } // Apply general histogram +Inf overflow rule to all histogram metrics - infOverflowResults := EvaluateHistogramInfOverflow(metrics, loadLevel) + infOverflowResults := EvaluateHistogramInfOverflow(metrics) for _, result := range infOverflowResults { report.Results = append(report.Results, result) diff --git a/internal/evaluator/evaluator_test.go b/internal/evaluator/evaluator_test.go index 5746c9b..e2b4f2b 100644 --- a/internal/evaluator/evaluator_test.go +++ b/internal/evaluator/evaluator_test.go @@ -406,138 +406,228 @@ func TestIsRuleApplicable(t *testing.T) { } func TestEvaluateHistogramInfOverflow(t *testing.T) { - tests := map[string]struct { - metrics parser.MetricsData - wantStatus map[string]rules.Status // metric name -> expected status - wantCount int // expected number of results - }{ - "should return red status when >50% in +Inf": { - metrics: parser.MetricsData{ - "test_histogram_bucket": &parser.Metric{ - Name: "test_histogram_bucket", - Type: "histogram", - Values: []parser.MetricValue{ - {Value: 10, Labels: map[string]string{"le": "0.1"}}, - {Value: 20, Labels: map[string]string{"le": "0.5"}}, - {Value: 30, Labels: map[string]string{"le": "1.0"}}, - {Value: 100, Labels: map[string]string{"le": "+Inf"}}, // 70 in +Inf (70%) - }, - }, - }, - wantStatus: map[string]rules.Status{ - "test_histogram (+Inf overflow check)": rules.StatusRed, - }, - wantCount: 1, - }, - "should return yellow status when >25% but <=50% in +Inf": { - metrics: parser.MetricsData{ - "test_histogram_bucket": &parser.Metric{ - Name: "test_histogram_bucket", - Type: "histogram", - Values: []parser.MetricValue{ - {Value: 10, Labels: map[string]string{"le": "0.1"}}, - {Value: 20, Labels: map[string]string{"le": "0.5"}}, - {Value: 30, Labels: map[string]string{"le": "1.0"}}, - {Value: 50, Labels: map[string]string{"le": "+Inf"}}, // 20 in +Inf (40%) - }, - }, - }, - wantStatus: map[string]rules.Status{ - "test_histogram (+Inf overflow check)": rules.StatusYellow, - }, - wantCount: 1, - }, - "should return green status when <=25% in +Inf": { - metrics: parser.MetricsData{ - "test_histogram_bucket": &parser.Metric{ - Name: "test_histogram_bucket", - Type: "histogram", - Values: []parser.MetricValue{ - {Value: 10, Labels: map[string]string{"le": "0.1"}}, - {Value: 20, Labels: map[string]string{"le": "0.5"}}, - {Value: 30, Labels: map[string]string{"le": "1.0"}}, - {Value: 35, Labels: map[string]string{"le": "+Inf"}}, // 5 in +Inf (14.3%) - }, + t.Run("splits red and yellow by label set with label-aware titles", func(t *testing.T) { + metrics := parser.MetricsData{ + "test_histogram": &parser.Metric{ + Name: "test_histogram", + Help: "Histogram help from base metric", + }, + "test_histogram_bucket": &parser.Metric{ + Name: "test_histogram_bucket", + Type: "histogram", + Values: []parser.MetricValue{ + {Value: 10, Labels: map[string]string{"component": "alpha", "le": "1.0"}}, + {Value: 100, Labels: map[string]string{"component": "alpha", "le": "+Inf"}}, // red (90%) + {Value: 70, Labels: map[string]string{"component": "beta", "le": "1.0"}}, + {Value: 100, Labels: map[string]string{"component": "beta", "le": "+Inf"}}, // yellow (30%) + {Value: 95, Labels: map[string]string{"component": "gamma", "le": "1.0"}}, + {Value: 100, Labels: map[string]string{"component": "gamma", "le": "+Inf"}}, // green (5%) + }, + }, + } + + results := EvaluateHistogramInfOverflow(metrics) + if len(results) != 2 { + t.Fatalf("expected 2 problematic series results, got %d", len(results)) + } + + gotStatus := map[string]rules.Status{} + for _, result := range results { + gotStatus[result.RuleName] = result.Status + if result.MetricHelp != "Histogram help from base metric" { + t.Errorf("unexpected MetricHelp: %q", result.MetricHelp) + } + if result.Message == "" { + t.Errorf("message should not be empty for %s", result.RuleName) + } + if !strings.Contains(result.Message, "Highest non-infinity bucket") { + t.Errorf("message should contain highest finite bucket details for %s", result.RuleName) + } + } + + if gotStatus[`test_histogram{component="alpha"} (+Inf overflow check)`] != rules.StatusRed { + t.Errorf("expected alpha series to be red, got %v", gotStatus[`test_histogram{component="alpha"} (+Inf overflow check)`]) + } + if gotStatus[`test_histogram{component="beta"} (+Inf overflow check)`] != rules.StatusYellow { + t.Errorf("expected beta series to be yellow, got %v", gotStatus[`test_histogram{component="beta"} (+Inf overflow check)`]) + } + if _, exists := gotStatus["test_histogram (+Inf overflow check)"]; exists { + t.Error("did not expect aggregated green summary when problematic series exist") + } + }) + + t.Run("emits single green summary when all series are green", func(t *testing.T) { + metrics := parser.MetricsData{ + "test_histogram": &parser.Metric{ + Name: "test_histogram", + Help: "Green summary help", + }, + "test_histogram_bucket": &parser.Metric{ + Name: "test_histogram_bucket", + Type: "histogram", + Values: []parser.MetricValue{ + {Value: 95, Labels: map[string]string{"component": "alpha", "le": "1.0"}}, + {Value: 100, Labels: map[string]string{"component": "alpha", "le": "+Inf"}}, + {Value: 92, Labels: map[string]string{"component": "beta", "le": "1.0"}}, + {Value: 100, Labels: map[string]string{"component": "beta", "le": "+Inf"}}, + }, + }, + } + + results := EvaluateHistogramInfOverflow(metrics) + if len(results) != 1 { + t.Fatalf("expected 1 green summary result, got %d", len(results)) + } + result := results[0] + if result.RuleName != "test_histogram (+Inf overflow check)" { + t.Errorf("unexpected green summary rule name: %q", result.RuleName) + } + if result.Status != rules.StatusGreen { + t.Errorf("expected green summary status, got %v", result.Status) + } + if result.MetricHelp != "Green summary help" { + t.Errorf("unexpected MetricHelp on green summary: %q", result.MetricHelp) + } + }) + + t.Run("falls back to bucket help when base help missing", func(t *testing.T) { + metrics := parser.MetricsData{ + "test_histogram_bucket": &parser.Metric{ + Name: "test_histogram_bucket", + Help: "Bucket help fallback text", + Type: "histogram", + Values: []parser.MetricValue{ + {Value: 10, Labels: map[string]string{"component": "alpha", "le": "1.0"}}, + {Value: 100, Labels: map[string]string{"component": "alpha", "le": "+Inf"}}, + }, + }, + } + + results := EvaluateHistogramInfOverflow(metrics) + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + if results[0].MetricHelp != "Bucket help fallback text" { + t.Errorf("expected fallback MetricHelp, got %q", results[0].MetricHelp) + } + }) + + t.Run("handles label-free histogram with only le buckets", func(t *testing.T) { + metrics := parser.MetricsData{ + "test_histogram_bucket": &parser.Metric{ + Name: "test_histogram_bucket", + Type: "histogram", + Values: []parser.MetricValue{ + {Value: 10, Labels: map[string]string{"le": "1.0"}}, + {Value: 100, Labels: map[string]string{"le": "+Inf"}}, }, }, - wantStatus: map[string]rules.Status{ - "test_histogram (+Inf overflow check)": rules.StatusGreen, - }, - wantCount: 1, - }, - "should skip histogram without +Inf bucket": { - metrics: parser.MetricsData{ - "test_histogram_bucket": &parser.Metric{ - Name: "test_histogram_bucket", - Type: "histogram", - Values: []parser.MetricValue{ - {Value: 10, Labels: map[string]string{"le": "0.1"}}, - {Value: 20, Labels: map[string]string{"le": "0.5"}}, - }, + } + + results := EvaluateHistogramInfOverflow(metrics) + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + result := results[0] + if result.RuleName != "test_histogram (+Inf overflow check)" { + t.Errorf("expected label-free rule name, got %q", result.RuleName) + } + if result.Status != rules.StatusRed { + t.Errorf("expected red status (90%% overflow), got %v", result.Status) + } + }) + + t.Run("skips histogram without +Inf bucket", func(t *testing.T) { + metrics := parser.MetricsData{ + "test_histogram_bucket": &parser.Metric{ + Name: "test_histogram_bucket", + Type: "histogram", + Values: []parser.MetricValue{ + {Value: 10, Labels: map[string]string{"le": "0.1"}}, + {Value: 20, Labels: map[string]string{"le": "0.5"}}, }, }, - wantStatus: map[string]rules.Status{}, - wantCount: 0, - }, - "should handle multiple histograms": { - metrics: parser.MetricsData{ - "hist1_bucket": &parser.Metric{ - Name: "hist1_bucket", - Type: "histogram", - Values: []parser.MetricValue{ - {Value: 10, Labels: map[string]string{"le": "1.0"}}, - {Value: 100, Labels: map[string]string{"le": "+Inf"}}, // 90 in +Inf (90%) - }, - }, - "hist2_bucket": &parser.Metric{ - Name: "hist2_bucket", - Type: "histogram", - Values: []parser.MetricValue{ - {Value: 10, Labels: map[string]string{"le": "1.0"}}, - {Value: 15, Labels: map[string]string{"le": "+Inf"}}, // 5 in +Inf (33%) - }, + } + + results := EvaluateHistogramInfOverflow(metrics) + if len(results) != 0 { + t.Fatalf("expected 0 results, got %d", len(results)) + } + }) + + t.Run("guesses unit from metric name for +Inf output", func(t *testing.T) { + metrics := parser.MetricsData{ + "test_duration_seconds_bucket": &parser.Metric{ + Name: "test_duration_seconds_bucket", + Type: "histogram", + Values: []parser.MetricValue{ + {Value: 10, Labels: map[string]string{"le": "512"}}, + {Value: 100, Labels: map[string]string{"le": "+Inf"}}, }, }, - wantStatus: map[string]rules.Status{ - "hist1 (+Inf overflow check)": rules.StatusRed, - "hist2 (+Inf overflow check)": rules.StatusYellow, - }, - wantCount: 2, - }, - } + } - for name, tt := range tests { - t.Run(name, func(t *testing.T) { - results := EvaluateHistogramInfOverflow(tt.metrics, rules.LoadLevelMedium) + results := EvaluateHistogramInfOverflow(metrics) + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + if !strings.Contains(results[0].Message, "Highest non-infinity bucket: 512 s") { + t.Fatalf("expected guessed seconds unit in message, got: %s", results[0].Message) + } + details := strings.Join(results[0].Details, "\n") + if !strings.Contains(details, "Highest non-infinity bucket: 512 s") { + t.Fatalf("expected guessed seconds unit in details, got: %s", details) + } + }) - if len(results) != tt.wantCount { - t.Errorf("EvaluateHistogramInfOverflow() returned %d results, want %d", len(results), tt.wantCount) - } + t.Run("uses help text only when unit is unambiguous", func(t *testing.T) { + metrics := parser.MetricsData{ + "mystery_metric": &parser.Metric{ + Name: "mystery_metric", + Help: "Time taken in milliseconds for processing", + Type: "histogram", + }, + "mystery_metric_bucket": &parser.Metric{ + Name: "mystery_metric_bucket", + Type: "histogram", + Values: []parser.MetricValue{ + {Value: 10, Labels: map[string]string{"le": "512"}}, + {Value: 100, Labels: map[string]string{"le": "+Inf"}}, + }, + }, + } + results := EvaluateHistogramInfOverflow(metrics) + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + if !strings.Contains(results[0].Message, "Highest non-infinity bucket: 512 ms") { + t.Fatalf("expected guessed milliseconds unit in message, got: %s", results[0].Message) + } + }) - for _, result := range results { - wantStatus, exists := tt.wantStatus[result.RuleName] - if !exists { - t.Errorf("Unexpected result for metric %s", result.RuleName) - continue - } - - if result.Status != wantStatus { - t.Errorf("EvaluateHistogramInfOverflow() for %s = %v, want %v", result.RuleName, result.Status, wantStatus) - } - - // Verify message contains expected information - if result.Status != rules.StatusGreen { - if result.Message == "" { - t.Error("Message should not be empty for non-green status") - } - if !strings.Contains(result.Message, "Highest non-infinity bucket") { - t.Error("Message should contain 'Highest non-infinity bucket'") - } - if !strings.Contains(result.Message, "didn't expect") { - t.Error("Message should contain explanation about designer expectations") - } - } - } - }) - } + t.Run("does not use help text when multiple units are present", func(t *testing.T) { + metrics := parser.MetricsData{ + "mystery_metric": &parser.Metric{ + Name: "mystery_metric", + Help: "Latency shown in milliseconds and seconds for compatibility", + Type: "histogram", + }, + "mystery_metric_bucket": &parser.Metric{ + Name: "mystery_metric_bucket", + Type: "histogram", + Values: []parser.MetricValue{ + {Value: 10, Labels: map[string]string{"le": "512"}}, + {Value: 100, Labels: map[string]string{"le": "+Inf"}}, + }, + }, + } + results := EvaluateHistogramInfOverflow(metrics) + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + if strings.Contains(results[0].Message, "Highest non-infinity bucket: 512 ms") || + strings.Contains(results[0].Message, "Highest non-infinity bucket: 512 s") { + t.Fatalf("expected no guessed unit in ambiguous help text, got: %s", results[0].Message) + } + }) } diff --git a/internal/evaluator/histogram.go b/internal/evaluator/histogram.go index d43f637..430b348 100644 --- a/internal/evaluator/histogram.go +++ b/internal/evaluator/histogram.go @@ -12,6 +12,11 @@ import ( "github.com/stackrox/sensor-metrics-analyzer/internal/rules" ) +const ( + infOverflowYellowThresholdPercent = 25.0 + infOverflowRedThresholdPercent = 50.0 +) + // EvaluateHistogram evaluates a histogram rule func EvaluateHistogram(rule rules.Rule, metrics parser.MetricsData, loadLevel rules.LoadLevel) rules.EvaluationResult { result := rules.EvaluationResult{ @@ -115,60 +120,60 @@ func EvaluateHistogram(rule rules.Rule, metrics parser.MetricsData, loadLevel ru // EvaluateHistogramInfOverflow evaluates all histogram metrics for +Inf bucket overflow // This is a general rule that applies to any histogram metric -func EvaluateHistogramInfOverflow(metrics parser.MetricsData, loadLevel rules.LoadLevel) []rules.EvaluationResult { +func EvaluateHistogramInfOverflow(metrics parser.MetricsData) []rules.EvaluationResult { var results []rules.EvaluationResult // Get all histogram base names histogramBases := metrics.GetHistogramBaseNames() for _, baseName := range histogramBases { - result := evaluateSingleHistogramInfOverflow(baseName, metrics) - if result != nil { - results = append(results, *result) - } + results = append(results, evaluateSingleHistogramInfOverflow(baseName, metrics)...) } return results } // evaluateSingleHistogramInfOverflow evaluates a single histogram metric for +Inf overflow -// Handles multiple label combinations (series) by evaluating each separately and reporting the worst case -func evaluateSingleHistogramInfOverflow(baseName string, metrics parser.MetricsData) *rules.EvaluationResult { - result := &rules.EvaluationResult{ - RuleName: baseName + " (+Inf overflow check)", - Status: rules.StatusGreen, - Details: []string{}, - Timestamp: time.Now(), - } - result.ReviewStatus = "Automatically generated rule; reviewed by the code author at the time of implementation." - +// Handles multiple label combinations (series) by emitting one result per problematic series. +func evaluateSingleHistogramInfOverflow(baseName string, metrics parser.MetricsData) []rules.EvaluationResult { // Get histogram buckets bucketMetricName := baseName + "_bucket" bucketMetric, exists := metrics.GetMetric(bucketMetricName) if !exists || len(bucketMetric.Values) == 0 { - return nil // Skip if no buckets found + return nil } + metricHelp := resolveMetricHelp(baseName, metrics) + guessedUnit := guessMetricUnit(baseName, metricHelp) // Group buckets by label combination (excluding "le" label) // Each label combination represents a separate time series seriesBuckets := make(map[string][]parser.MetricValue) + seriesLabels := make(map[string]map[string]string) for _, v := range bucketMetric.Values { // Create a key from all labels except "le" seriesKey := getSeriesKey(v.Labels) seriesBuckets[seriesKey] = append(seriesBuckets[seriesKey], v) + if _, seen := seriesLabels[seriesKey]; !seen { + seriesLabels[seriesKey] = extractSeriesLabels(v.Labels) + } } - // Track worst case across all series - var worstInfPercentage float64 - var worstInfObservations float64 - var worstTotalCount float64 - var worstHighestFiniteLe float64 - var worstStatus rules.Status + type seriesEvaluation struct { + labels map[string]string + infPercentage float64 + infObservations float64 + totalCount float64 + highestFiniteLe float64 + status rules.Status + } + var evaluations []seriesEvaluation hasAnyData := false + var worstOverall *seriesEvaluation + var worstSeriesKey string // Evaluate each series separately - for _, buckets := range seriesBuckets { + for seriesKey, buckets := range seriesBuckets { // Find +Inf bucket and highest finite bucket for this series var infCount float64 var highestFiniteLe float64 @@ -206,57 +211,95 @@ func evaluateSingleHistogramInfOverflow(baseName string, metrics parser.MetricsD } infPercentage := (infObservations / infCount) * 100.0 - // Track worst case - if infPercentage > worstInfPercentage { - worstInfPercentage = infPercentage - worstInfObservations = infObservations - worstTotalCount = infCount - worstHighestFiniteLe = highestFiniteLe - - // Determine status for this series - if infPercentage > 50.0 { - worstStatus = rules.StatusRed - } else if infPercentage > 25.0 { - worstStatus = rules.StatusYellow - } else { - worstStatus = rules.StatusGreen - } + status := rules.StatusGreen + if infPercentage > infOverflowRedThresholdPercent { + status = rules.StatusRed + } else if infPercentage > infOverflowYellowThresholdPercent { + status = rules.StatusYellow } - } - if !hasAnyData { - return nil + eval := seriesEvaluation{ + labels: seriesLabels[seriesKey], + infPercentage: infPercentage, + infObservations: infObservations, + totalCount: infCount, + highestFiniteLe: highestFiniteLe, + status: status, + } + evaluations = append(evaluations, eval) + + if worstOverall == nil || infPercentage > worstOverall.infPercentage || + (infPercentage == worstOverall.infPercentage && seriesKey < worstSeriesKey) { + worstCopy := eval + worstOverall = &worstCopy + worstSeriesKey = seriesKey + } } - result.Status = worstStatus - if baseMetric, ok := metrics.GetMetric(baseName); ok && baseMetric.Help != "" { - result.Details = append(result.Details, "Metric Description: "+baseMetric.Help) - } else if bucketMetric, ok := metrics.GetMetric(baseName + "_bucket"); ok && bucketMetric.Help != "" { - result.Details = append(result.Details, "Metric Description: "+bucketMetric.Help) + if !hasAnyData || len(evaluations) == 0 { + return nil } - result.Details = append(result.Details, - "Total Number of Observations: "+formatHumanNumber(worstTotalCount), - "Observations in +Inf bucket: "+formatHumanNumber(worstInfObservations), - "Percentage of observations in +Inf bucket: "+formatHumanNumber(worstInfPercentage)+" %", - "Highest non-infinity bucket: "+formatHumanNumber(worstHighestFiniteLe)+" unit", - ) - // Build message based on worst case - result.Message = fmt.Sprintf("%s%% of observations in +Inf bucket (acceptable). Highest non-infinity bucket: %s", - formatHumanNumber(worstInfPercentage), formatHumanNumber(worstHighestFiniteLe)) - if worstInfPercentage > 25.0 { + var results []rules.EvaluationResult + for _, eval := range evaluations { + if eval.status == rules.StatusGreen { + continue + } + result := rules.EvaluationResult{ + RuleName: formatSeriesRuleName(baseName, eval.labels), + Status: eval.status, + MetricHelp: metricHelp, + Details: []string{}, + Timestamp: time.Now(), + ReviewStatus: "Automatically generated rule; reviewed by the code author at the time of implementation.", + PotentialActionUser: fmt.Sprintf("Further investigation is required to understand why values exceed %s. "+ + "Check if there are other alerts for this specific metric with more precise context.", formatHistogramValue(eval.highestFiniteLe, guessedUnit)), + PotentialActionDeveloper: "Review code paths and metric instrumentation to confirm whether observed latencies are expected.", + } + result.Details = append(result.Details, + "Total Number of Observations: "+formatHumanNumber(eval.totalCount), + "Observations in +Inf bucket: "+formatHumanNumber(eval.infObservations), + "Percentage of observations in +Inf bucket: "+formatHumanNumber(eval.infPercentage)+" %", + "Highest non-infinity bucket: "+formatHistogramValue(eval.highestFiniteLe, guessedUnit), + ) result.Message = fmt.Sprintf("%s%% of observations are in +Inf bucket (%s out of %s). "+ - "This indicates the metric designer likely didn't expect processing durations to be so high. "+ + "This indicates the metric designer likely didn't expect the values to be so high. "+ + "This may indicate a problem in the system or a problem with metrics design. "+ "Highest non-infinity bucket: %s", - formatHumanNumber(worstInfPercentage), - formatHumanNumber(worstInfObservations), - formatHumanNumber(worstTotalCount), - formatHumanNumber(worstHighestFiniteLe)) - result.PotentialActionUser = fmt.Sprintf("Further investigation is required to understand why values exceed %s. "+ - "Check if there are other alerts for this specific metric with more precise context.", formatHumanNumber(worstHighestFiniteLe)) - result.PotentialActionDeveloper = "Review code paths and metric instrumentation to confirm whether observed latencies are expected." + formatHumanNumber(eval.infPercentage), + formatHumanNumber(eval.infObservations), + formatHumanNumber(eval.totalCount), + formatHistogramValue(eval.highestFiniteLe, guessedUnit)) + results = append(results, result) } - return result + + if len(results) > 0 { + sort.Slice(results, func(i, j int) bool { + if results[i].Status != results[j].Status { + return results[i].Status == rules.StatusRed + } + return results[i].RuleName < results[j].RuleName + }) + return results + } + + greenResult := rules.EvaluationResult{ + RuleName: baseName + " (+Inf overflow check)", + Status: rules.StatusGreen, + MetricHelp: metricHelp, + Details: []string{}, + Timestamp: time.Now(), + ReviewStatus: "Automatically generated rule; reviewed by the code author at the time of implementation.", + } + greenResult.Details = append(greenResult.Details, + "Total Number of Observations: "+formatHumanNumber(worstOverall.totalCount), + "Observations in +Inf bucket: "+formatHumanNumber(worstOverall.infObservations), + "Percentage of observations in +Inf bucket: "+formatHumanNumber(worstOverall.infPercentage)+" %", + "Highest non-infinity bucket: "+formatHistogramValue(worstOverall.highestFiniteLe, guessedUnit), + ) + greenResult.Message = fmt.Sprintf("%s%% of observations in +Inf bucket (acceptable). Highest non-infinity bucket: %s", + formatHumanNumber(worstOverall.infPercentage), formatHistogramValue(worstOverall.highestFiniteLe, guessedUnit)) + return []rules.EvaluationResult{greenResult} } // getSeriesKey creates a key from labels excluding "le" to group buckets by series @@ -271,6 +314,43 @@ func getSeriesKey(labels map[string]string) string { return strings.Join(keys, ",") } +func extractSeriesLabels(labels map[string]string) map[string]string { + result := make(map[string]string) + for key, value := range labels { + if key == "le" { + continue + } + result[key] = value + } + return result +} + +func formatSeriesRuleName(baseName string, labels map[string]string) string { + if len(labels) == 0 { + return baseName + " (+Inf overflow check)" + } + var keys []string + for key := range labels { + keys = append(keys, key) + } + sort.Strings(keys) + var parts []string + for _, key := range keys { + parts = append(parts, key+"=\""+labels[key]+"\"") + } + return baseName + "{" + strings.Join(parts, ",") + "} (+Inf overflow check)" +} + +func resolveMetricHelp(baseName string, metrics parser.MetricsData) string { + if baseMetric, ok := metrics.GetMetric(baseName); ok && baseMetric.Help != "" { + return baseMetric.Help + } + if bucketMetric, ok := metrics.GetMetric(baseName + "_bucket"); ok && bucketMetric.Help != "" { + return bucketMetric.Help + } + return "" +} + func formatHumanNumber(value float64) string { return formatHumanNumberWithPrecision(value, 2) } diff --git a/internal/evaluator/unit_guess.go b/internal/evaluator/unit_guess.go new file mode 100644 index 0000000..0b036b0 --- /dev/null +++ b/internal/evaluator/unit_guess.go @@ -0,0 +1,92 @@ +package evaluator + +import ( + "regexp" + "strings" +) + +var ( + unitTokenToCanonical = map[string]string{ + "seconds": "seconds", + "second": "seconds", + "s": "seconds", + "sec": "seconds", + "secs": "seconds", + "milliseconds": "milliseconds", + "millisecond": "milliseconds", + "ms": "milliseconds", + "msec": "milliseconds", + "msecs": "milliseconds", + "millis": "milliseconds", + "bytes": "bytes", + "byte": "bytes", + } + + helpUnitMatchers = map[string]*regexp.Regexp{ + "milliseconds": regexp.MustCompile(`(?i)\bmilliseconds?\b|\bms\b|\bmsecs?\b|\bmillis\b`), + "seconds": regexp.MustCompile(`(?i)\bseconds?\b|\bsecs?\b|\bsec\b`), + "bytes": regexp.MustCompile(`(?i)\bbytes?\b`), + } +) + +// guessMetricUnit infers unit from metric name first, then HELP text. +// HELP text is only used when exactly one unit candidate is detected. +func guessMetricUnit(metricName, helpText string) string { + if unit := guessUnitFromMetricName(metricName); unit != "" { + return unit + } + return guessUnitFromHelpText(helpText) +} + +func guessUnitFromMetricName(metricName string) string { + metricName = strings.TrimSpace(strings.ToLower(metricName)) + if metricName == "" { + return "" + } + + parts := strings.Split(metricName, "_") + if len(parts) == 0 { + return "" + } + + // Prometheus best-practice suffixes like *_seconds_total. + if len(parts) >= 2 && parts[len(parts)-1] == "total" { + if unit, ok := unitTokenToCanonical[parts[len(parts)-2]]; ok { + return unit + } + } + + // Common suffixes like *_seconds and *_bytes. + if unit, ok := unitTokenToCanonical[parts[len(parts)-1]]; ok { + return unit + } + + // Handle *_timestamp_seconds. + if len(parts) >= 2 && parts[len(parts)-2] == "timestamp" { + if unit, ok := unitTokenToCanonical[parts[len(parts)-1]]; ok { + return unit + } + } + + return "" +} + +func guessUnitFromHelpText(helpText string) string { + helpText = strings.TrimSpace(helpText) + if helpText == "" { + return "" + } + + var matched []string + for unit, matcher := range helpUnitMatchers { + if matcher.MatchString(helpText) { + matched = append(matched, unit) + } + } + + // Use HELP only if it points to exactly one unit. + if len(matched) != 1 { + return "" + } + return matched[0] +} diff --git a/internal/evaluator/unit_guess_test.go b/internal/evaluator/unit_guess_test.go new file mode 100644 index 0000000..bf18940 --- /dev/null +++ b/internal/evaluator/unit_guess_test.go @@ -0,0 +1,112 @@ +package evaluator + +import "testing" + +func TestGuessUnitFromMetricName(t *testing.T) { + tests := []struct { + name string + metricName string + want string + }{ + {name: "seconds suffix", metricName: "http_request_duration_seconds", want: "seconds"}, + {name: "bytes suffix", metricName: "container_memory_usage_bytes", want: "bytes"}, + {name: "seconds total suffix", metricName: "process_cpu_seconds_total", want: "seconds"}, + {name: "timestamp seconds suffix", metricName: "last_seen_timestamp_seconds", want: "seconds"}, + {name: "real metric histogram bytes", metricName: "http_incoming_request_size_histogram_bytes", want: "bytes"}, + {name: "real metric bytes total", metricName: "go_memstats_alloc_bytes_total", want: "bytes"}, + {name: "real metric duration milliseconds", metricName: "rox_sensor_scan_call_duration_milliseconds", want: "milliseconds"}, + {name: "real metric purger duration seconds", metricName: "rox_sensor_network_flow_manager_purger_duration_seconds", want: "seconds"}, + {name: "real metric vm report duration milliseconds", metricName: "rox_sensor_virtual_machine_index_report_processing_duration_milliseconds", want: "milliseconds"}, + {name: "sec alias suffix", metricName: "request_duration_sec", want: "seconds"}, + {name: "msec alias suffix", metricName: "request_duration_msec", want: "milliseconds"}, + {name: "no known suffix", metricName: "rox_sensor_events", want: ""}, + {name: "empty metric name", metricName: "", want: ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := guessUnitFromMetricName(tt.metricName) + if got != tt.want { + t.Fatalf("guessUnitFromMetricName(%q) = %q, want %q", tt.metricName, got, tt.want) + } + }) + } +} + +func TestGuessUnitFromHelpText(t *testing.T) { + tests := []struct { + name string + helpText string + want string + }{ + {name: "milliseconds mention", helpText: "Time taken in milliseconds to process events", want: "milliseconds"}, + {name: "seconds mention", helpText: "Duration in seconds", want: "seconds"}, + {name: "seconds short sec mention", helpText: "Duration in sec", want: "seconds"}, + {name: "bytes mention", helpText: "Payload size in bytes", want: "bytes"}, + {name: "milliseconds alias msec", helpText: "Call took 32 msec", want: "milliseconds"}, + {name: "milliseconds alias millis", helpText: "Observed latency in millis", want: "milliseconds"}, + {name: "byte singular mention", helpText: "Equals to /memory/classes/total:byte.", want: "bytes"}, + {name: "ambiguous mentions", helpText: "Latency in milliseconds and seconds", want: ""}, + {name: "ambiguous time and bytes mentions", helpText: "Duration in seconds with payload bytes", want: ""}, + {name: "no known units", helpText: "Number of retries", want: ""}, + {name: "empty help text", helpText: "", want: ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := guessUnitFromHelpText(tt.helpText) + if got != tt.want { + t.Fatalf("guessUnitFromHelpText(%q) = %q, want %q", tt.helpText, got, tt.want) + } + }) + } +} + +func TestGuessMetricUnit(t *testing.T) { + tests := []struct { + name string + metricName string + helpText string + want string + }{ + { + name: "metric name wins over help text", + metricName: "process_cpu_seconds_total", + helpText: "CPU usage in milliseconds", + want: "seconds", + }, + { + name: "falls back to help text when name has no unit", + metricName: "rox_central_event_processing", + helpText: "Time taken in milliseconds", + want: "milliseconds", + }, + { + name: "real cluster duration without unit in name uses help", + metricName: "rox_sensor_k8s_event_processing_duration", + helpText: "Time spent fully processing an event from Kubernetes in milliseconds", + want: "milliseconds", + }, + { + name: "returns empty when both sources fail", + metricName: "rox_sensor_events", + helpText: "Count of events", + want: "", + }, + { + name: "real metric without unit in name or help", + metricName: "rox_sensor_k8s_event_processing_duration", + helpText: "Time taken to fully process an event from Kubernetes", + want: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := guessMetricUnit(tt.metricName, tt.helpText) + if got != tt.want { + t.Fatalf("guessMetricUnit(%q, %q) = %q, want %q", tt.metricName, tt.helpText, got, tt.want) + } + }) + } +} diff --git a/internal/reporter/console.go b/internal/reporter/console.go index 088c038..7cfc3e8 100644 --- a/internal/reporter/console.go +++ b/internal/reporter/console.go @@ -64,6 +64,9 @@ func GenerateConsole(report rules.AnalysisReport) string { for _, r := range redResults { result.WriteString(color.New(color.Bold).Sprintf("%s\n", r.RuleName)) result.WriteString(color.RedString(" Status: RED\n")) + if r.MetricHelp != "" { + result.WriteString(fmt.Sprintf(" Metric description: %s\n", r.MetricHelp)) + } result.WriteString(fmt.Sprintf(" Message: %s\n", r.Message)) if r.ReviewStatus != "" { result.WriteString(fmt.Sprintf(" Review: %s\n", r.ReviewStatus)) @@ -95,6 +98,9 @@ func GenerateConsole(report rules.AnalysisReport) string { for _, r := range yellowResults { result.WriteString(color.New(color.Bold).Sprintf("%s\n", r.RuleName)) result.WriteString(color.YellowString(" Status: YELLOW\n")) + if r.MetricHelp != "" { + result.WriteString(fmt.Sprintf(" Metric description: %s\n", r.MetricHelp)) + } result.WriteString(fmt.Sprintf(" Message: %s\n", r.Message)) if r.ReviewStatus != "" { result.WriteString(fmt.Sprintf(" Review: %s\n", r.ReviewStatus)) diff --git a/internal/rules/types.go b/internal/rules/types.go index e6a45e7..109c2dd 100644 --- a/internal/rules/types.go +++ b/internal/rules/types.go @@ -196,6 +196,7 @@ type LoadDetectionThreshold struct { type EvaluationResult struct { RuleName string Status Status + MetricHelp string Message string Value float64 Details []string diff --git a/internal/tui/view.go b/internal/tui/view.go index fbc1ece..014b8f0 100644 --- a/internal/tui/view.go +++ b/internal/tui/view.go @@ -144,19 +144,24 @@ func (m Model) renderResultsList() string { for i := start; i < end; i++ { r := m.filteredResults[i] - // Truncate name if too long + nameMax := 35 + msgMax := 40 + if m.width > 90 { + nameMax = m.width*55/100 - 3 + msgMax = m.width - nameMax - 5 + } + name := r.RuleName - if len(name) > 35 { - name = name[:32] + "..." + if len(name) > nameMax { + name = name[:nameMax-3] + "..." } - // Truncate message if too long msg := r.Message - if len(msg) > 40 { - msg = msg[:37] + "..." + if len(msg) > msgMax { + msg = msg[:msgMax-3] + "..." } - line := fmt.Sprintf("%s %-35s %s", StatusEmoji(string(r.Status)), name, msg) + line := fmt.Sprintf("%s %-*s %s", StatusEmoji(string(r.Status)), nameMax, name, msg) if i == m.cursor { lines = append(lines, selectedItemStyle.Render(line)) @@ -210,9 +215,6 @@ func (m Model) viewDetail() string { } detail.WriteString("\n\n") - // Message (wrapped to screen width) - detail.WriteString(detailLabelStyle.Render("Message:")) - detail.WriteString("\n") messageWidth := 70 if m.width > 0 { messageWidth = m.width - 10 @@ -220,6 +222,21 @@ func (m Model) viewDetail() string { messageWidth = 40 } } + + // Metric description + if result.MetricHelp != "" { + detail.WriteString(detailLabelStyle.Render("Metric description:")) + detail.WriteString("\n") + wrappedDescription := wordWrap(result.MetricHelp, messageWidth) + for _, line := range strings.Split(wrappedDescription, "\n") { + detail.WriteString(fmt.Sprintf(" %s\n", line)) + } + detail.WriteString("\n") + } + + // Message (wrapped to screen width) + detail.WriteString(detailLabelStyle.Render("Message:")) + detail.WriteString("\n") wrappedMessage := wordWrap(result.Message, messageWidth) for _, line := range strings.Split(wrappedMessage, "\n") { detail.WriteString(fmt.Sprintf(" %s\n", line)) diff --git a/templates/markdown.tmpl b/templates/markdown.tmpl index ac0ee82..390df78 100644 --- a/templates/markdown.tmpl +++ b/templates/markdown.tmpl @@ -21,6 +21,10 @@ {{ end }} ### 🔴 {{ $r.RuleName }} +{{ if $r.MetricHelp }} +#### Metric description +{{ $r.MetricHelp }} +{{ end }} #### Message {{ $r.Message }} {{ if $r.ReviewStatus }} @@ -58,6 +62,10 @@ This alert was generated by evaluating a rule. That rule was reviewed: {{ end }} ### 🟡 {{ $r.RuleName }} +{{ if $r.MetricHelp }} +#### Metric description +{{ $r.MetricHelp }} +{{ end }} #### Message {{ $r.Message }} {{ if $r.ReviewStatus }}