From 0c0049521740924aec94c6162f9b4bf444cf6024 Mon Sep 17 00:00:00 2001 From: Itay Tsabary Date: Sun, 8 Feb 2026 17:14:22 +0200 Subject: [PATCH] apollo_dashboard: remove duplicated per-env dashboard files --- Cargo.lock | 2 - crates/apollo_dashboard/Cargo.toml | 2 - ...s_testnet.json => dev_grafana_alerts.json} | 48 +- .../resources/dev_grafana_alerts_mainnet.json | 1712 ----------------- .../apollo_dashboard/src/alert_definitions.rs | 27 +- .../alert_scenarios/block_production_delay.rs | 68 +- .../alert_scenarios/block_production_halt.rs | 63 +- .../src/alert_scenarios/infra_alerts.rs | 7 - .../src/alert_scenarios/l1_gas_prices.rs | 52 +- .../src/alert_scenarios/l1_handlers.rs | 18 +- .../src/alert_scenarios/mempool_size.rs | 35 +- .../src/alert_scenarios/preconfirmed.rs | 18 +- .../src/alert_scenarios/sync_halt.rs | 22 +- .../src/alert_scenarios/tps.rs | 19 +- .../src/alert_scenarios/transaction_delays.rs | 86 +- .../alert_scenarios/transaction_failures.rs | 38 +- crates/apollo_dashboard/src/alerts.rs | 49 +- .../src/bin/sequencer_dashboard_generator.rs | 12 +- .../src/dashboard_definitions_test.rs | 13 +- crates/apollo_dashboard/src/dashboard_test.rs | 2 - deployments/local-testing/deploy.sh | 23 +- .../monitoring/src/builders/alert_builder.py | 22 +- .../src/builders/dashboard_builder.py | 4 +- deployments/monitoring/src/common/cli.py | 8 - deployments/monitoring/src/common/env.py | 3 +- 25 files changed, 92 insertions(+), 2261 deletions(-) rename crates/apollo_dashboard/resources/{dev_grafana_alerts_testnet.json => dev_grafana_alerts.json} (99%) delete mode 100644 crates/apollo_dashboard/resources/dev_grafana_alerts_mainnet.json diff --git a/Cargo.lock b/Cargo.lock index c81c1d19fca..fe28a5cc4a2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1465,8 +1465,6 @@ dependencies = [ "serde", "serde_json", "serde_with", - "strum 0.25.0", - "strum_macros 0.25.3", ] [[package]] diff --git a/crates/apollo_dashboard/Cargo.toml b/crates/apollo_dashboard/Cargo.toml index d50bbecccb1..611a899b55c 100644 --- a/crates/apollo_dashboard/Cargo.toml +++ b/crates/apollo_dashboard/Cargo.toml @@ -40,8 +40,6 @@ indexmap = { workspace = true, features = ["serde"] } serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, features = ["arbitrary_precision"] } serde_with.workspace = true -strum.workspace = true -strum_macros.workspace = true [dev-dependencies] apollo_batcher = { workspace = true, features = ["testing"] } diff --git a/crates/apollo_dashboard/resources/dev_grafana_alerts_testnet.json b/crates/apollo_dashboard/resources/dev_grafana_alerts.json similarity index 99% rename from crates/apollo_dashboard/resources/dev_grafana_alerts_testnet.json rename to crates/apollo_dashboard/resources/dev_grafana_alerts.json index 1a50a9d4b6b..ea33c632faa 100644 --- a/crates/apollo_dashboard/resources/dev_grafana_alerts_testnet.json +++ b/crates/apollo_dashboard/resources/dev_grafana_alerts.json @@ -977,7 +977,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p2", "observer_applicable": "true" }, { @@ -1005,7 +1005,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p3", "observer_applicable": "false" }, { @@ -1033,7 +1033,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p3", + "severity": "p1", "observer_applicable": "false" }, { @@ -1089,7 +1089,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p1", "observer_applicable": "true" }, { @@ -1145,7 +1145,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p2", "observer_applicable": "true" }, { @@ -1173,7 +1173,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p1", "observer_applicable": "false" }, { @@ -1201,7 +1201,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p1", "observer_applicable": "false" }, { @@ -1229,7 +1229,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p3", "observer_applicable": "false" }, { @@ -1369,7 +1369,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p3", "observer_applicable": "false" }, { @@ -1397,7 +1397,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p2", + "severity": "p1", "observer_applicable": "false" }, { @@ -1425,7 +1425,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p2", "observer_applicable": "false" }, { @@ -1453,7 +1453,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p3", "observer_applicable": "false" }, { @@ -1493,7 +1493,7 @@ { "evaluator": { "params": [ - 0.6 + 0.3 ], "type": "gt" }, @@ -1509,7 +1509,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p2", + "severity": "p1", "observer_applicable": "false" }, { @@ -1537,7 +1537,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p3", "observer_applicable": "false" }, { @@ -1565,7 +1565,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p3", "observer_applicable": "false" }, { @@ -1593,7 +1593,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p3", + "severity": "p1", "observer_applicable": "false" }, { @@ -1621,7 +1621,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p3", + "severity": "p2", "observer_applicable": "false" }, { @@ -1649,7 +1649,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p2", "observer_applicable": "false" }, { @@ -1677,7 +1677,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p3", "observer_applicable": "false" }, { @@ -1705,7 +1705,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p3", "observer_applicable": "false" }, { @@ -1733,7 +1733,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p4", + "severity": "p1", "observer_applicable": "false" }, { @@ -1761,7 +1761,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p3", + "severity": "p2", "observer_applicable": "false" }, { @@ -1789,7 +1789,7 @@ ], "for": "30s", "intervalSec": 30, - "severity": "p3", + "severity": "p2", "observer_applicable": "true" }, { diff --git a/crates/apollo_dashboard/resources/dev_grafana_alerts_mainnet.json b/crates/apollo_dashboard/resources/dev_grafana_alerts_mainnet.json deleted file mode 100644 index 5b6059bdf1a..00000000000 --- a/crates/apollo_dashboard/resources/dev_grafana_alerts_mainnet.json +++ /dev/null @@ -1,1712 +0,0 @@ -{ - "alerts": [ - { - "name": "batcher_storage_open_read_transactions", - "title": "batcher - High number of open read transactions", - "ruleGroup": "state_sync", - "expr": "max_over_time(batcher_storage_open_read_transactions{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1m])", - "conditions": [ - { - "evaluator": { - "params": [ - 7500.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p4", - "observer_applicable": "false" - }, - { - "name": "class_manager_storage_open_read_transactions", - "title": "class_manager - High number of open read transactions", - "ruleGroup": "state_sync", - "expr": "max_over_time(class_manager_storage_open_read_transactions{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1m])", - "conditions": [ - { - "evaluator": { - "params": [ - 7500.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p4", - "observer_applicable": "false" - }, - { - "name": "consensus_proposal_fin_mismatch_once", - "title": "Consensus proposal fin mismatch occurred", - "ruleGroup": "consensus", - "expr": "sum(increase(consensus_proposal_fin_mismatch{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p4", - "observer_applicable": "false" - }, - { - "name": "cende_write_blob_failure_once", - "title": "Cende write blob failure once", - "ruleGroup": "consensus", - "expr": "increase(cende_write_blob_failure{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h])", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p5", - "observer_applicable": "false" - }, - { - "name": "cende_write_prev_height_blob_latency_too_high", - "title": "Cende write prev height blob latency too high", - "ruleGroup": "consensus", - "expr": "(sum(rate(cende_write_prev_height_blob_latency_sum{cluster=~\"$cluster\", namespace=~\"$namespace\"}[20m])) or vector(0)) / clamp_min(sum(rate(cende_write_prev_height_blob_latency_count{cluster=~\"$cluster\", namespace=~\"$namespace\"}[20m])) or vector(0), 0.0000001)", - "conditions": [ - { - "evaluator": { - "params": [ - 3.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p4", - "observer_applicable": "false" - }, - { - "name": "consensus_conflicting_votes", - "title": "Consensus conflicting votes", - "ruleGroup": "consensus", - "expr": "sum(increase(consensus_conflicting_votes{cluster=~\"$cluster\", namespace=~\"$namespace\"}[20m])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p4", - "observer_applicable": "false" - }, - { - "name": "consensus_decisions_reached_by_consensus_ratio", - "title": "Consensus decisions reached by consensus ratio", - "ruleGroup": "consensus", - "expr": "(sum(increase(consensus_decisions_reached_by_consensus{cluster=~\"$cluster\", namespace=~\"$namespace\"}[10m])) or vector(0)) / clamp_min((sum(increase(consensus_decisions_reached_by_sync{cluster=~\"$cluster\", namespace=~\"$namespace\"}[10m])) or vector(0)) + (sum(increase(consensus_decisions_reached_by_consensus{cluster=~\"$cluster\", namespace=~\"$namespace\"}[10m])) or vector(0)), 1)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.5 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p4", - "observer_applicable": "false" - }, - { - "name": "consensus_inbound_stream_evicted", - "title": "Consensus inbound stream evicted", - "ruleGroup": "consensus", - "expr": "sum(increase(consensus_inbound_stream_evicted{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 5.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p5", - "observer_applicable": "false" - }, - { - "name": "consensus_l1_gas_price_provider_failure", - "title": "Consensus L1 gas price provider failure", - "ruleGroup": "consensus", - "expr": "sum(increase(consensus_l1_gas_price_provider_error{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 5.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p4", - "observer_applicable": "false" - }, - { - "name": "consensus_l1_gas_price_provider_failure_once", - "title": "Consensus L1 gas price provider failure once", - "ruleGroup": "consensus", - "expr": "sum(increase(consensus_l1_gas_price_provider_error{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p5", - "observer_applicable": "false" - }, - { - "name": "consensus_p2p_disconnections", - "title": "Consensus p2p disconnections", - "ruleGroup": "consensus", - "expr": "(sum(changes(apollo_consensus_num_connected_peers{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h])) or vector(0)) / 2", - "conditions": [ - { - "evaluator": { - "params": [ - 10.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p4", - "observer_applicable": "true" - }, - { - "name": "consensus_retrospective_block_hash_from_state_sync", - "title": "Consensus retrospective block hash retrieved from State Sync (instead of the Batcher)", - "ruleGroup": "consensus", - "expr": "sum(increase(consensus_retrospective_block_hash_from_state_sync{cluster=~\"$cluster\", namespace=~\"$namespace\"}[5m])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p5", - "observer_applicable": "true" - }, - { - "name": "consensus_round_above_zero", - "title": "Consensus round above zero", - "ruleGroup": "consensus", - "expr": "increase(consensus_round_above_zero{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h])", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p5", - "observer_applicable": "false" - }, - { - "name": "consensus_votes_num_sent_messages", - "title": "Consensus votes num sent messages", - "ruleGroup": "consensus", - "expr": "sum(increase(apollo_consensus_votes_num_sent_messages{cluster=~\"$cluster\", namespace=~\"$namespace\"}[20m])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 20.0 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p5", - "observer_applicable": "false" - }, - { - "name": "eth_to_strk_error_count", - "title": "Eth to Strk error count", - "ruleGroup": "l1_gas_price", - "expr": "sum(increase(eth_to_strk_error_count{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 10.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "1m", - "intervalSec": 20, - "severity": "p5", - "observer_applicable": "false" - }, - { - "name": "gateway_add_tx_idle_p2p_rpc", - "title": "Gateway add_tx idle (p2p+rpc)", - "ruleGroup": "gateway", - "expr": "sum(increase(gateway_transactions_received{cluster=~\"$cluster\", namespace=~\"$namespace\"}[$$$gateway_add_tx_idle_p2p_rpc-sampling_window_secs-expression$$$s])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.1 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p2", - "observer_applicable": "false" - }, - { - "name": "pod_state_not_ready", - "title": "Pod State Not Ready", - "ruleGroup": "general", - "expr": "kube_pod_container_status_ready{cluster=~\"$cluster\", namespace=~\"$namespace\"}", - "conditions": [ - { - "evaluator": { - "params": [ - 1.0 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p2", - "observer_applicable": "false" - }, - { - "name": "pod_state_crashloopbackoff", - "title": "Pod State CrashLoopBackOff", - "ruleGroup": "general", - "expr": "sum by(container, pod, namespace) (kube_pod_container_status_waiting_reason{cluster=~\"$cluster\", namespace=~\"$namespace\", reason=\"CrashLoopBackOff\"}) or absent(kube_pod_container_status_waiting_reason{cluster=~\"$cluster\", namespace=~\"$namespace\", reason=\"CrashLoopBackOff\"}) * 0", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p2", - "observer_applicable": "true" - }, - { - "name": "pod_high_cpu_utilization", - "title": "Pod High CPU Utilization ( >90% )", - "ruleGroup": "general", - "expr": "max(irate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[2m])) by (container, pod, namespace) / (max(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\"}/100000) by (container, pod, namespace)) * 100", - "conditions": [ - { - "evaluator": { - "params": [ - 90.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p2", - "observer_applicable": "true" - }, - { - "name": "http_server_high_deprecated_transaction_failure_ratio", - "title": "http server high deprecated transaction failure ratio", - "ruleGroup": "http_server", - "expr": "increase(http_server_added_transactions_deprecated_error{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h]) / clamp_min(increase(http_server_added_transactions_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h]), 1)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.1 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p5", - "observer_applicable": "false" - }, - { - "name": "http_server_high_transaction_failure_ratio", - "title": "http server high transaction failure ratio", - "ruleGroup": "http_server", - "expr": "(increase(http_server_added_transactions_failure{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h]) - increase(http_server_added_transactions_deprecated_error{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h])) / clamp_min(increase(http_server_added_transactions_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h]), 1)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.5 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p5", - "observer_applicable": "false" - }, - { - "name": "http_server_internal_error_once", - "title": "http server internal error once", - "ruleGroup": "http_server", - "expr": "increase(http_server_added_transactions_internal_error{cluster=~\"$cluster\", namespace=~\"$namespace\"}[20m]) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p4", - "observer_applicable": "false" - }, - { - "name": "http_server_no_successful_transactions", - "title": "http server no successful transactions", - "ruleGroup": "http_server", - "expr": "sum(increase(http_server_added_transactions_success{cluster=~\"$cluster\", namespace=~\"$namespace\"}[$$$http_server_no_successful_transactions-sampling_window_secs-expression$$$s])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.1 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p5", - "observer_applicable": "false" - }, - { - "name": "l1_gas_price_scraper_reorg_detected", - "title": "L1 gas price scraper reorg detected", - "ruleGroup": "l1_gas_price", - "expr": "sum(increase(l1_gas_price_scraper_reorg_detected{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1m])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p5", - "observer_applicable": "false" - }, - { - "name": "l1_gas_price_scraper_baselayer_error_count", - "title": "L1 gas price scraper baselayer error count", - "ruleGroup": "l1_gas_price", - "expr": "sum(increase(l1_gas_price_scraper_baselayer_error_count{cluster=~\"$cluster\", namespace=~\"$namespace\"}[5m])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p5", - "observer_applicable": "false" - }, - { - "name": "l1_message_scraper_baselayer_error_count", - "title": "L1 message scraper baselayer error count", - "ruleGroup": "l1_messages", - "expr": "sum(increase(l1_message_scraper_baselayer_error_count{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 5.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p5", - "observer_applicable": "false" - }, - { - "name": "l1_message_scraper_reorg_detected", - "title": "L1 message scraper reorg detected", - "ruleGroup": "l1_messages", - "expr": "sum(increase(l1_message_scraper_reorg_detected{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1m])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p1", - "observer_applicable": "false" - }, - { - "name": "mempool_add_tx_idle_p2p_rpc", - "title": "Mempool add_tx idle (p2p+rpc)", - "ruleGroup": "mempool", - "expr": "sum(increase(mempool_transactions_received{cluster=~\"$cluster\", namespace=~\"$namespace\"}[$$$mempool_add_tx_idle_p2p_rpc-sampling_window_secs-expression$$$s])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.1 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p1", - "observer_applicable": "false" - }, - { - "name": "mempool_p2p_disconnections", - "title": "Mempool p2p disconnections", - "ruleGroup": "mempool", - "expr": "(sum(changes(apollo_mempool_p2p_num_connected_peers{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h])) or vector(0)) / 2", - "conditions": [ - { - "evaluator": { - "params": [ - 10.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p4", - "observer_applicable": "false" - }, - { - "name": "native_compilation_error", - "title": "Native compilation alert", - "ruleGroup": "batcher", - "expr": "sum(increase(native_compilation_error{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p4", - "observer_applicable": "false" - }, - { - "name": "periodic_ping", - "title": "Periodic Ping", - "ruleGroup": "general", - "expr": "(day_of_week() == bool 0) * (hour() == bool 7) * (minute() == bool 55)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p2", - "observer_applicable": "true" - }, - { - "name": "sync_storage_open_read_transactions", - "title": "sync - High number of open read transactions", - "ruleGroup": "state_sync", - "expr": "max_over_time(sync_storage_open_read_transactions{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1m])", - "conditions": [ - { - "evaluator": { - "params": [ - 7500.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p4", - "observer_applicable": "false" - }, - { - "name": "batched_transactions_stuck", - "title": "Batched transactions stuck", - "ruleGroup": "batcher", - "expr": "changes(batcher_batched_transactions{cluster=~\"$cluster\", namespace=~\"$namespace\"}[$$$batched_transactions_stuck-sampling_window_secs-expression$$$s])", - "conditions": [ - { - "evaluator": { - "params": [ - 1.0 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "$$$batched_transactions_stuck-severity$$$", - "observer_applicable": "false" - }, - { - "name": "get_consensus_block_number_progress_is_slow", - "title": "Consensus block number progress is slow", - "ruleGroup": "consensus", - "expr": "sum(increase(consensus_block_number{cluster=~\"$cluster\", namespace=~\"$namespace\"}[2m])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - "$$$get_consensus_block_number_progress_is_slow-comparison_value$$$" - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p2", - "observer_applicable": "true" - }, - { - "name": "cende_write_blob_failure", - "title": "Cende write blob failure", - "ruleGroup": "consensus", - "expr": "increase(cende_write_blob_failure{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h])", - "conditions": [ - { - "evaluator": { - "params": [ - 10.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p3", - "observer_applicable": "false" - }, - { - "name": "consensus_block_number_stuck", - "title": "Consensus block number stuck", - "ruleGroup": "consensus", - "expr": "sum(increase(consensus_block_number{cluster=~\"$cluster\", namespace=~\"$namespace\"}[$$$consensus_block_number_stuck-sampling_window_secs-expression$$$s])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 1.0 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p1", - "observer_applicable": "false" - }, - { - "name": "consensus_p2p_not_enough_peers_for_quorum", - "title": "Consensus p2p not enough peers for quorum", - "ruleGroup": "consensus", - "expr": "max_over_time(apollo_consensus_num_connected_peers{cluster=~\"$cluster\", namespace=~\"$namespace\"}[120s])", - "conditions": [ - { - "evaluator": { - "params": [ - 1.0 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p1", - "observer_applicable": "true" - }, - { - "name": "consensus_p2p_peer_down", - "title": "Consensus p2p peer down", - "ruleGroup": "consensus", - "expr": "max_over_time(apollo_consensus_num_connected_peers{cluster=~\"$cluster\", namespace=~\"$namespace\"}[2m])", - "conditions": [ - { - "evaluator": { - "params": [ - 2.0 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p2", - "observer_applicable": "true" - }, - { - "name": "consensus_round_above_zero_multiple_times", - "title": "Consensus round above zero multiple times", - "ruleGroup": "consensus", - "expr": "increase(consensus_round_above_zero{cluster=~\"$cluster\", namespace=~\"$namespace\"}[$$$consensus_round_above_zero_multiple_times-sampling_window_secs-expression$$$s])", - "conditions": [ - { - "evaluator": { - "params": [ - "$$$consensus_round_above_zero_multiple_times-comparison_value$$$" - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p1", - "observer_applicable": "false" - }, - { - "name": "consensus_round_high", - "title": "Consensus round high", - "ruleGroup": "consensus", - "expr": "max_over_time(consensus_round{cluster=~\"$cluster\", namespace=~\"$namespace\"}[2m])", - "conditions": [ - { - "evaluator": { - "params": [ - "$$$consensus_round_high-comparison_value$$$" - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p1", - "observer_applicable": "false" - }, - { - "name": "eth_to_strk_success_count", - "title": "Eth to Strk success count", - "ruleGroup": "l1_gas_price", - "expr": "increase(eth_to_strk_success_count{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h])", - "conditions": [ - { - "evaluator": { - "params": [ - 1.0 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p3", - "observer_applicable": "false" - }, - { - "name": "pod_state_high_memory_utilization", - "title": "Pod High Memory Utilization ( >70% )", - "ruleGroup": "general", - "expr": "max(container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}) by (container, pod, namespace) / max(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}) by (container, pod, namespace) * 100", - "conditions": [ - { - "evaluator": { - "params": [ - 70.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p3", - "observer_applicable": "true" - }, - { - "name": "pod_state_critical_memory_utilization", - "title": "Pod Critical Memory Utilization ( >85% )", - "ruleGroup": "general", - "expr": "max(container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}) by (container, pod, namespace) / max(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}) by (container, pod, namespace) * 100", - "conditions": [ - { - "evaluator": { - "params": [ - 85.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p2", - "observer_applicable": "true" - }, - { - "name": "pod_state_high_disk_utilization", - "title": "Pod High Disk Utilization ( >70% )", - "ruleGroup": "general", - "expr": "max by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}) / (min by (namespace,persistentvolumeclaim) (kubelet_volume_stats_available_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}) + max by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}))*100", - "conditions": [ - { - "evaluator": { - "params": [ - 70.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p3", - "observer_applicable": "true" - }, - { - "name": "pod_state_critical_disk_utilization", - "title": "Pod Critical Disk Utilization ( >90% )", - "ruleGroup": "general", - "expr": "max by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}) / (min by (namespace,persistentvolumeclaim) (kubelet_volume_stats_available_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}) + max by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}))*100", - "conditions": [ - { - "evaluator": { - "params": [ - 90.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p2", - "observer_applicable": "true" - }, - { - "name": "http_server_avg_add_tx_latency", - "title": "High HTTP server average add_tx latency", - "ruleGroup": "http_server", - "expr": "rate(http_server_add_tx_latency_sum{cluster=~\"$cluster\", namespace=~\"$namespace\"}[5m]) / clamp_min(rate(http_server_add_tx_latency_count{cluster=~\"$cluster\", namespace=~\"$namespace\"}[5m]), 1/300)", - "conditions": [ - { - "evaluator": { - "params": [ - 15.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p3", - "observer_applicable": "false" - }, - { - "name": "http_server_min_add_tx_latency", - "title": "High HTTP server minimal add_tx latency", - "ruleGroup": "http_server", - "expr": "(sum(increase(http_server_add_tx_latency_count{cluster=~\"$cluster\", namespace=~\"$namespace\"}[2m])) > 0) * (sum(increase(http_server_add_tx_latency_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", le=\"1.0\"}[2m])) < 1)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p1", - "observer_applicable": "false" - }, - { - "name": "http_server_internal_error_ratio", - "title": "http server internal error ratio", - "ruleGroup": "http_server", - "expr": "increase(http_server_added_transactions_internal_error{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h]) / clamp_min(increase(http_server_added_transactions_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h]), 1)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.01 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p2", - "observer_applicable": "false" - }, - { - "name": "gateway_low_successful_transaction_rate", - "title": "gateway low successful transaction rate", - "ruleGroup": "gateway", - "expr": "sum(increase(gateway_transactions_sent_to_mempool{cluster=~\"$cluster\", namespace=~\"$namespace\"}[10m])) or vector(0)", - "conditions": [ - { - "evaluator": { - "params": [ - 5.0 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p3", - "observer_applicable": "false" - }, - { - "name": "http_server_p95_add_tx_latency", - "title": "High HTTP server P95 add_tx latency", - "ruleGroup": "http_server", - "expr": "histogram_quantile(0.95, sum(rate(http_server_add_tx_latency_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\"}[5m])) by (le))", - "conditions": [ - { - "evaluator": { - "params": [ - 2.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p5", - "observer_applicable": "false" - }, - { - "name": "high_empty_blocks_ratio", - "title": "High ratio of empty blocks", - "ruleGroup": "batcher", - "expr": "sum(increase(batcher_num_transaction_in_block_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", le=\"0.001\"}[$$$high_empty_blocks_ratio-zero_bucket-sampling_window_secs-expression$$$s])) / clamp_min(sum(increase(batcher_num_transaction_in_block_count{cluster=~\"$cluster\", namespace=~\"$namespace\"}[$$$high_empty_blocks_ratio-total_count-sampling_window_secs-expression$$$s])), 1)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.3 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p1", - "observer_applicable": "false" - }, - { - "name": "l1_gas_price_provider_insufficient_history", - "title": "L1 gas price provider insufficient history", - "ruleGroup": "l1_gas_price", - "expr": "increase(l1_gas_price_provider_insufficient_history{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1m])", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p3", - "observer_applicable": "false" - }, - { - "name": "l1_gas_price_scraper_success_count", - "title": "L1 gas price scraper success count", - "ruleGroup": "l1_gas_price", - "expr": "increase(l1_gas_price_scraper_success_count{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h])", - "conditions": [ - { - "evaluator": { - "params": [ - 1.0 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p3", - "observer_applicable": "false" - }, - { - "name": "l1_message_no_successes", - "title": "L1 message no successes", - "ruleGroup": "l1_gas_price", - "expr": "increase(l1_message_scraper_success_count{cluster=~\"$cluster\", namespace=~\"$namespace\"}[5m])", - "conditions": [ - { - "evaluator": { - "params": [ - 1.0 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p1", - "observer_applicable": "false" - }, - { - "name": "mempool_evictions_count", - "title": "Mempool evictions count", - "ruleGroup": "mempool", - "expr": "mempool_transactions_dropped{cluster=~\"$cluster\", namespace=~\"$namespace\", drop_reason=\"evicted\"}", - "conditions": [ - { - "evaluator": { - "params": [ - 0.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p2", - "observer_applicable": "false" - }, - { - "name": "mempool_p2p_peer_down", - "title": "Mempool p2p peer down", - "ruleGroup": "mempool", - "expr": "max_over_time(apollo_mempool_p2p_num_connected_peers{cluster=~\"$cluster\", namespace=~\"$namespace\"}[2m])", - "conditions": [ - { - "evaluator": { - "params": [ - 2.0 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p2", - "observer_applicable": "false" - }, - { - "name": "mempool_pool_size_increase", - "title": "Mempool pool size increase", - "ruleGroup": "mempool", - "expr": "mempool_pool_size{cluster=~\"$cluster\", namespace=~\"$namespace\"}", - "conditions": [ - { - "evaluator": { - "params": [ - 10000.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p3", - "observer_applicable": "false" - }, - { - "name": "mempool_transaction_drop_ratio", - "title": "Mempool transaction drop ratio", - "ruleGroup": "mempool", - "expr": "increase(mempool_transactions_dropped{cluster=~\"$cluster\", namespace=~\"$namespace\"}[10m]) / clamp_min(increase(mempool_transactions_received{cluster=~\"$cluster\", namespace=~\"$namespace\"}[10m]), 1)", - "conditions": [ - { - "evaluator": { - "params": [ - 0.2 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p3", - "observer_applicable": "false" - }, - { - "name": "preconfirmed_block_not_written", - "title": "Preconfirmed block not written", - "ruleGroup": "batcher", - "expr": "increase(batcher_preconfirmed_block_written{cluster=~\"$cluster\", namespace=~\"$namespace\"}[2m])", - "conditions": [ - { - "evaluator": { - "params": [ - 1.0 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p1", - "observer_applicable": "false" - }, - { - "name": "state_sync_lag", - "title": "State sync lag", - "ruleGroup": "state_sync", - "expr": "apollo_central_sync_central_block_marker{cluster=~\"$cluster\", namespace=~\"$namespace\"} - apollo_state_sync_class_manager_marker{cluster=~\"$cluster\", namespace=~\"$namespace\"}", - "conditions": [ - { - "evaluator": { - "params": [ - 5.0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p2", - "observer_applicable": "false" - }, - { - "name": "state_sync_stuck", - "title": "State sync stuck", - "ruleGroup": "state_sync", - "expr": "increase(apollo_state_sync_class_manager_marker{cluster=~\"$cluster\", namespace=~\"$namespace\"}[120s])", - "conditions": [ - { - "evaluator": { - "params": [ - 1.0 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "for": "30s", - "intervalSec": 30, - "severity": "p2", - "observer_applicable": "true" - } - ] -} diff --git a/crates/apollo_dashboard/src/alert_definitions.rs b/crates/apollo_dashboard/src/alert_definitions.rs index f5f21dff29e..d6e9cdbf0f2 100644 --- a/crates/apollo_dashboard/src/alert_definitions.rs +++ b/crates/apollo_dashboard/src/alert_definitions.rs @@ -90,7 +90,6 @@ use crate::alerts::{ Alert, AlertComparisonOp, AlertCondition, - AlertEnvFiltering, AlertGroup, AlertLogicalOp, AlertSeverity, @@ -100,8 +99,8 @@ use crate::alerts::{ PENDING_DURATION_DEFAULT, }; -pub fn get_dev_alerts_json_path(alert_env_filtering: AlertEnvFiltering) -> String { - format!("crates/apollo_dashboard/resources/dev_grafana_alerts_{alert_env_filtering}.json") +pub fn get_dev_alerts_json_path() -> String { + "crates/apollo_dashboard/resources/dev_grafana_alerts.json".to_string() } // TODO(guy.f): Can we have spaces in the alert names? If so, do we want to make the alert name and @@ -127,7 +126,6 @@ fn get_consensus_decisions_reached_by_consensus_ratio() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::WorkingHours, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -145,7 +143,6 @@ fn get_consensus_inbound_stream_evicted_alert() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::Informational, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -163,7 +160,6 @@ fn get_consensus_votes_num_sent_messages_alert() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::Informational, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -183,7 +179,6 @@ fn get_cende_write_prev_height_blob_latency_too_high() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::WorkingHours, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -201,7 +196,6 @@ fn get_consensus_l1_gas_price_provider_failure() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::WorkingHours, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -219,7 +213,6 @@ fn get_consensus_l1_gas_price_provider_failure_once() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::Informational, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -237,7 +230,6 @@ fn get_consensus_proposal_fin_mismatch_once() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::WorkingHours, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -256,7 +248,6 @@ fn get_consensus_conflicting_votes() -> Alert { // TODO(matan): Increase severity once slashing is supported. AlertSeverity::WorkingHours, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -274,7 +265,6 @@ fn get_consensus_retrospective_block_hash_from_state_sync() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::Informational, ObserverApplicability::Applicable, - AlertEnvFiltering::All, ) } @@ -292,7 +282,6 @@ fn get_eth_to_strk_error_count_alert() -> Alert { 20, AlertSeverity::Informational, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -310,7 +299,6 @@ fn get_l1_gas_price_scraper_baselayer_error_count_alert() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::Informational, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -328,7 +316,6 @@ fn get_l1_gas_price_reorg_detected_alert() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::Informational, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -346,7 +333,6 @@ fn get_l1_message_scraper_baselayer_error_count_alert() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::Informational, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -364,7 +350,6 @@ fn get_l1_message_scraper_reorg_detected_alert() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::Sos, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -382,7 +367,6 @@ fn get_native_compilation_error_increase() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::WorkingHours, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -403,7 +387,6 @@ fn get_consensus_p2p_disconnections() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::WorkingHours, ObserverApplicability::Applicable, - AlertEnvFiltering::All, ) } @@ -424,7 +407,6 @@ fn get_mempool_p2p_disconnections() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::WorkingHours, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -445,7 +427,6 @@ fn create_storage_open_read_transactions_alert(storage_type: &str, metric_name: EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::WorkingHours, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -470,7 +451,7 @@ fn get_class_manager_storage_open_read_transactions_alert() -> Alert { ) } -pub fn get_apollo_alerts(alert_env_filtering: AlertEnvFiltering) -> Alerts { +pub fn get_apollo_alerts() -> Alerts { let mut alerts = vec![ get_batcher_storage_open_read_transactions_alert(), get_class_manager_storage_open_read_transactions_alert(), @@ -534,5 +515,5 @@ pub fn get_apollo_alerts(alert_env_filtering: AlertEnvFiltering) -> Alerts { alerts.append(&mut get_state_sync_lag_vec()); alerts.append(&mut get_state_sync_stuck_vec()); - Alerts::new(alerts, alert_env_filtering) + Alerts::new(alerts) } diff --git a/crates/apollo_dashboard/src/alert_scenarios/block_production_delay.rs b/crates/apollo_dashboard/src/alert_scenarios/block_production_delay.rs index dce2ee5a6a3..5286379516b 100644 --- a/crates/apollo_dashboard/src/alert_scenarios/block_production_delay.rs +++ b/crates/apollo_dashboard/src/alert_scenarios/block_production_delay.rs @@ -13,7 +13,6 @@ use crate::alerts::{ Alert, AlertComparisonOp, AlertCondition, - AlertEnvFiltering, AlertGroup, AlertLogicalOp, AlertSeverity, @@ -34,14 +33,10 @@ pub(crate) fn get_consensus_round_above_zero() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::Informational, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } -fn get_consensus_round_above_zero_multiple_times( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_consensus_round_above_zero_multiple_times(alert_severity: AlertSeverity) -> Alert { const ALERT_NAME: &str = "consensus_round_above_zero_multiple_times"; let expr_template_string = format!("increase({}[{{}}s])", CONSENSUS_ROUND_ABOVE_ZERO.get_name_with_filter()); @@ -62,27 +57,14 @@ fn get_consensus_round_above_zero_multiple_times( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_consensus_round_above_zero_multiple_times_vec() -> Vec { - vec![ - get_consensus_round_above_zero_multiple_times( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::Sos, - ), - get_consensus_round_above_zero_multiple_times( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::WorkingHours, - ), - ] + vec![get_consensus_round_above_zero_multiple_times(AlertSeverity::Sos)] } -fn get_cende_write_blob_failure_alert( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_cende_write_blob_failure_alert(alert_severity: AlertSeverity) -> Alert { Alert::new( "cende_write_blob_failure", "Cende write blob failure", @@ -93,27 +75,14 @@ fn get_cende_write_blob_failure_alert( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_cende_write_blob_failure_alert_vec() -> Vec { - vec![ - get_cende_write_blob_failure_alert( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::DayOnly, - ), - get_cende_write_blob_failure_alert( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::WorkingHours, - ), - ] + vec![get_cende_write_blob_failure_alert(AlertSeverity::DayOnly)] } -fn get_consensus_p2p_peer_down( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_consensus_p2p_peer_down(alert_severity: AlertSeverity) -> Alert { Alert::new( "consensus_p2p_peer_down", "Consensus p2p peer down", @@ -129,18 +98,11 @@ fn get_consensus_p2p_peer_down( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::Applicable, - alert_env_filtering, ) } pub(crate) fn get_consensus_p2p_peer_down_vec() -> Vec { - vec![ - get_consensus_p2p_peer_down(AlertEnvFiltering::MainnetStyleAlerts, AlertSeverity::Regular), - get_consensus_p2p_peer_down( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::WorkingHours, - ), - ] + vec![get_consensus_p2p_peer_down(AlertSeverity::Regular)] } pub(crate) fn get_cende_write_blob_failure_once_alert() -> Alert { @@ -154,14 +116,10 @@ pub(crate) fn get_cende_write_blob_failure_once_alert() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::Informational, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } -fn get_consensus_block_number_progress_is_slow( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_consensus_block_number_progress_is_slow(alert_severity: AlertSeverity) -> Alert { const ALERT_NAME: &str = "get_consensus_block_number_progress_is_slow"; Alert::new( ALERT_NAME, @@ -180,19 +138,9 @@ fn get_consensus_block_number_progress_is_slow( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::Applicable, - alert_env_filtering, ) } pub(crate) fn get_consensus_block_number_progress_is_slow_vec() -> Vec { - vec![ - get_consensus_block_number_progress_is_slow( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::Regular, - ), - get_consensus_block_number_progress_is_slow( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::WorkingHours, - ), - ] + vec![get_consensus_block_number_progress_is_slow(AlertSeverity::Regular)] } diff --git a/crates/apollo_dashboard/src/alert_scenarios/block_production_halt.rs b/crates/apollo_dashboard/src/alert_scenarios/block_production_halt.rs index 78262405bf0..dd10f395c57 100644 --- a/crates/apollo_dashboard/src/alert_scenarios/block_production_halt.rs +++ b/crates/apollo_dashboard/src/alert_scenarios/block_production_halt.rs @@ -16,7 +16,6 @@ use crate::alerts::{ Alert, AlertComparisonOp, AlertCondition, - AlertEnvFiltering, AlertGroup, AlertLogicalOp, AlertSeverity, @@ -29,7 +28,6 @@ use crate::alerts::{ /// Block number is stuck for more than duration minutes. fn get_consensus_block_number_stuck( alert_name: &'static str, - alert_env_filtering: AlertEnvFiltering, alert_severity: AlertSeverity, ) -> Alert { let expr_template_string = format!( @@ -49,38 +47,20 @@ fn get_consensus_block_number_stuck( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_consensus_block_number_stuck_vec() -> Vec { vec![ - get_consensus_block_number_stuck( - "consensus_block_number_stuck", - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::Sos, - ), - get_consensus_block_number_stuck( - "consensus_block_number_stuck", - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::DayOnly, - ), + get_consensus_block_number_stuck("consensus_block_number_stuck", AlertSeverity::Sos), get_consensus_block_number_stuck( "consensus_block_number_stuck_long_time", - AlertEnvFiltering::TestnetStyleAlerts, AlertSeverity::Regular, ), ] } -// TODO(Tsabary): settle all the required parameters that are different among envs using the -// placeholder mechanism. -// TODO(Tsabary): remove `AlertEnvFiltering` throughout and use the placeholder mechanism instead. - -fn get_batched_transactions_stuck( - alert_name: &'static str, - alert_env_filtering: AlertEnvFiltering, -) -> Alert { +fn get_batched_transactions_stuck(alert_name: &'static str) -> Alert { let expr_template_string = format!("changes({}[{{}}s])", BATCHED_TRANSACTIONS.get_name_with_filter()); Alert::new( @@ -96,30 +76,18 @@ fn get_batched_transactions_stuck( EVALUATION_INTERVAL_SEC_DEFAULT, SeverityValueOrPlaceholder::Placeholder(alert_name.to_string()), ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_batched_transactions_stuck_vec() -> Vec { vec![ - get_batched_transactions_stuck( - "batched_transactions_stuck", - AlertEnvFiltering::MainnetStyleAlerts, - ), - get_batched_transactions_stuck( - "batched_transactions_stuck", - AlertEnvFiltering::TestnetStyleAlerts, - ), - get_batched_transactions_stuck( - "batched_transactions_stuck_long_time", - AlertEnvFiltering::TestnetStyleAlerts, - ), + get_batched_transactions_stuck("batched_transactions_stuck"), + get_batched_transactions_stuck("batched_transactions_stuck_long_time"), ] } fn get_consensus_p2p_not_enough_peers_for_quorum( alert_name: &'static str, - alert_env_filtering: AlertEnvFiltering, duration: Duration, alert_severity: AlertSeverity, ) -> Alert { @@ -143,7 +111,6 @@ fn get_consensus_p2p_not_enough_peers_for_quorum( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::Applicable, - alert_env_filtering, ) } @@ -151,29 +118,18 @@ pub(crate) fn get_consensus_p2p_not_enough_peers_for_quorum_vec() -> Vec vec![ get_consensus_p2p_not_enough_peers_for_quorum( "consensus_p2p_not_enough_peers_for_quorum", - AlertEnvFiltering::MainnetStyleAlerts, Duration::from_secs(2 * SECS_IN_MIN), AlertSeverity::Sos, ), - get_consensus_p2p_not_enough_peers_for_quorum( - "consensus_p2p_not_enough_peers_for_quorum", - AlertEnvFiltering::TestnetStyleAlerts, - Duration::from_secs(2 * SECS_IN_MIN), - AlertSeverity::WorkingHours, - ), get_consensus_p2p_not_enough_peers_for_quorum( "consensus_p2p_not_enough_peers_for_quorum_long_time", - AlertEnvFiltering::TestnetStyleAlerts, Duration::from_secs(30 * SECS_IN_MIN), AlertSeverity::Regular, ), ] } -fn get_consensus_round_high( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_consensus_round_high(alert_severity: AlertSeverity) -> Alert { const ALERT_NAME: &str = "consensus_round_high"; Alert::new( ALERT_NAME, @@ -189,16 +145,9 @@ fn get_consensus_round_high( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_consensus_round_high_vec() -> Vec { - vec![ - get_consensus_round_high(AlertEnvFiltering::MainnetStyleAlerts, AlertSeverity::Sos), - get_consensus_round_high( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::WorkingHours, - ), - ] + vec![get_consensus_round_high(AlertSeverity::Sos)] } diff --git a/crates/apollo_dashboard/src/alert_scenarios/infra_alerts.rs b/crates/apollo_dashboard/src/alert_scenarios/infra_alerts.rs index 1058858afd4..d1d971af60e 100644 --- a/crates/apollo_dashboard/src/alert_scenarios/infra_alerts.rs +++ b/crates/apollo_dashboard/src/alert_scenarios/infra_alerts.rs @@ -4,7 +4,6 @@ use crate::alerts::{ Alert, AlertComparisonOp, AlertCondition, - AlertEnvFiltering, AlertGroup, AlertLogicalOp, AlertSeverity, @@ -26,7 +25,6 @@ pub(crate) fn get_general_pod_state_not_ready() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::Regular, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -53,7 +51,6 @@ pub(crate) fn get_general_pod_state_crashloopbackoff() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::Regular, ObserverApplicability::Applicable, - AlertEnvFiltering::All, ) } @@ -85,7 +82,6 @@ fn get_general_pod_memory_utilization( EVALUATION_INTERVAL_SEC_DEFAULT, severity, ObserverApplicability::Applicable, - AlertEnvFiltering::All, ) } @@ -124,7 +120,6 @@ pub(crate) fn get_general_pod_high_cpu_utilization() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::Regular, ObserverApplicability::Applicable, - AlertEnvFiltering::All, ) } @@ -155,7 +150,6 @@ fn get_general_pod_disk_utilization( EVALUATION_INTERVAL_SEC_DEFAULT, severity, ObserverApplicability::Applicable, - AlertEnvFiltering::All, ) } @@ -190,6 +184,5 @@ pub(crate) fn get_periodic_ping() -> Alert { 30, AlertSeverity::Regular, ObserverApplicability::Applicable, - AlertEnvFiltering::All, ) } diff --git a/crates/apollo_dashboard/src/alert_scenarios/l1_gas_prices.rs b/crates/apollo_dashboard/src/alert_scenarios/l1_gas_prices.rs index bf18ed4f9df..4e9b016321a 100644 --- a/crates/apollo_dashboard/src/alert_scenarios/l1_gas_prices.rs +++ b/crates/apollo_dashboard/src/alert_scenarios/l1_gas_prices.rs @@ -9,7 +9,6 @@ use crate::alerts::{ Alert, AlertComparisonOp, AlertCondition, - AlertEnvFiltering, AlertGroup, AlertLogicalOp, AlertSeverity, @@ -19,10 +18,7 @@ use crate::alerts::{ }; /// Alert if we have no successful eth to strk rates data from the last hour. -fn get_eth_to_strk_success_count_alert( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_eth_to_strk_success_count_alert(alert_severity: AlertSeverity) -> Alert { Alert::new( "eth_to_strk_success_count", "Eth to Strk success count", @@ -33,28 +29,15 @@ fn get_eth_to_strk_success_count_alert( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_eth_to_strk_success_count_alert_vec() -> Vec { - vec![ - get_eth_to_strk_success_count_alert( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::DayOnly, - ), - get_eth_to_strk_success_count_alert( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::WorkingHours, - ), - ] + vec![get_eth_to_strk_success_count_alert(AlertSeverity::DayOnly)] } /// Alert if had no successful l1 gas price scrape in the last hour. -fn get_l1_gas_price_scraper_success_count_alert( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_l1_gas_price_scraper_success_count_alert(alert_severity: AlertSeverity) -> Alert { Alert::new( "l1_gas_price_scraper_success_count", "L1 gas price scraper success count", @@ -65,27 +48,14 @@ fn get_l1_gas_price_scraper_success_count_alert( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_l1_gas_price_scraper_success_count_alert_vec() -> Vec { - vec![ - get_l1_gas_price_scraper_success_count_alert( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::DayOnly, - ), - get_l1_gas_price_scraper_success_count_alert( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::WorkingHours, - ), - ] + vec![get_l1_gas_price_scraper_success_count_alert(AlertSeverity::DayOnly)] } -fn get_l1_gas_price_provider_insufficient_history_alert( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_l1_gas_price_provider_insufficient_history_alert(alert_severity: AlertSeverity) -> Alert { Alert::new( "l1_gas_price_provider_insufficient_history", "L1 gas price provider insufficient history", @@ -99,19 +69,9 @@ fn get_l1_gas_price_provider_insufficient_history_alert( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_l1_gas_price_provider_insufficient_history_alert_vec() -> Vec { - vec![ - get_l1_gas_price_provider_insufficient_history_alert( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::DayOnly, - ), - get_l1_gas_price_provider_insufficient_history_alert( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::WorkingHours, - ), - ] + vec![get_l1_gas_price_provider_insufficient_history_alert(AlertSeverity::DayOnly)] } diff --git a/crates/apollo_dashboard/src/alert_scenarios/l1_handlers.rs b/crates/apollo_dashboard/src/alert_scenarios/l1_handlers.rs index 04f2dbc55c8..123d9ea2310 100644 --- a/crates/apollo_dashboard/src/alert_scenarios/l1_handlers.rs +++ b/crates/apollo_dashboard/src/alert_scenarios/l1_handlers.rs @@ -5,7 +5,6 @@ use crate::alerts::{ Alert, AlertComparisonOp, AlertCondition, - AlertEnvFiltering, AlertGroup, AlertLogicalOp, AlertSeverity, @@ -14,10 +13,7 @@ use crate::alerts::{ PENDING_DURATION_DEFAULT, }; -fn get_l1_message_scraper_no_successes_alert( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_l1_message_scraper_no_successes_alert(alert_severity: AlertSeverity) -> Alert { Alert::new( "l1_message_no_successes", "L1 message no successes", @@ -28,19 +24,9 @@ fn get_l1_message_scraper_no_successes_alert( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_l1_message_scraper_no_successes_alert_vec() -> Vec { - vec![ - get_l1_message_scraper_no_successes_alert( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::Sos, - ), - get_l1_message_scraper_no_successes_alert( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::DayOnly, - ), - ] + vec![get_l1_message_scraper_no_successes_alert(AlertSeverity::Sos)] } diff --git a/crates/apollo_dashboard/src/alert_scenarios/mempool_size.rs b/crates/apollo_dashboard/src/alert_scenarios/mempool_size.rs index de3004d2b9d..7e91cb6d418 100644 --- a/crates/apollo_dashboard/src/alert_scenarios/mempool_size.rs +++ b/crates/apollo_dashboard/src/alert_scenarios/mempool_size.rs @@ -10,7 +10,6 @@ use crate::alerts::{ Alert, AlertComparisonOp, AlertCondition, - AlertEnvFiltering, AlertGroup, AlertLogicalOp, AlertSeverity, @@ -19,10 +18,7 @@ use crate::alerts::{ PENDING_DURATION_DEFAULT, }; -fn get_mempool_pool_size_increase( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_mempool_pool_size_increase(alert_severity: AlertSeverity) -> Alert { Alert::new( "mempool_pool_size_increase", "Mempool pool size increase", @@ -33,27 +29,14 @@ fn get_mempool_pool_size_increase( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_mempool_pool_size_increase_vec() -> Vec { - vec![ - get_mempool_pool_size_increase( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::DayOnly, - ), - get_mempool_pool_size_increase( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::WorkingHours, - ), - ] + vec![get_mempool_pool_size_increase(AlertSeverity::DayOnly)] } -fn get_mempool_evictions_count_alert( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_mempool_evictions_count_alert(alert_severity: AlertSeverity) -> Alert { let evicted_label: &str = DropReason::Evicted.into(); let query_expr = MEMPOOL_TRANSACTIONS_DROPPED.get_name_with_filer_and_additional_fields( @@ -70,19 +53,9 @@ fn get_mempool_evictions_count_alert( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_mempool_evictions_count_alert_vec() -> Vec { - vec![ - get_mempool_evictions_count_alert( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::Regular, - ), - get_mempool_evictions_count_alert( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::DayOnly, - ), - ] + vec![get_mempool_evictions_count_alert(AlertSeverity::Regular)] } diff --git a/crates/apollo_dashboard/src/alert_scenarios/preconfirmed.rs b/crates/apollo_dashboard/src/alert_scenarios/preconfirmed.rs index 59d5411738c..ed5c05995ce 100644 --- a/crates/apollo_dashboard/src/alert_scenarios/preconfirmed.rs +++ b/crates/apollo_dashboard/src/alert_scenarios/preconfirmed.rs @@ -5,7 +5,6 @@ use crate::alerts::{ Alert, AlertComparisonOp, AlertCondition, - AlertEnvFiltering, AlertGroup, AlertLogicalOp, AlertSeverity, @@ -15,10 +14,7 @@ use crate::alerts::{ }; /// No preconfirmed block was written in the last 10 minutes. -fn get_preconfirmed_block_not_written( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_preconfirmed_block_not_written(alert_severity: AlertSeverity) -> Alert { Alert::new( "preconfirmed_block_not_written", "Preconfirmed block not written", @@ -29,19 +25,9 @@ fn get_preconfirmed_block_not_written( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_preconfirmed_block_not_written_vec() -> Vec { - vec![ - get_preconfirmed_block_not_written( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::Sos, - ), - get_preconfirmed_block_not_written( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::WorkingHours, - ), - ] + vec![get_preconfirmed_block_not_written(AlertSeverity::Sos)] } diff --git a/crates/apollo_dashboard/src/alert_scenarios/sync_halt.rs b/crates/apollo_dashboard/src/alert_scenarios/sync_halt.rs index 8a8b181a226..58cd99e9058 100644 --- a/crates/apollo_dashboard/src/alert_scenarios/sync_halt.rs +++ b/crates/apollo_dashboard/src/alert_scenarios/sync_halt.rs @@ -10,7 +10,6 @@ use crate::alerts::{ Alert, AlertComparisonOp, AlertCondition, - AlertEnvFiltering, AlertGroup, AlertLogicalOp, AlertSeverity, @@ -20,10 +19,7 @@ use crate::alerts::{ SECS_IN_MIN, }; -fn get_state_sync_lag( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_state_sync_lag(alert_severity: AlertSeverity) -> Alert { Alert::new( "state_sync_lag", "State sync lag", @@ -38,20 +34,15 @@ fn get_state_sync_lag( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_state_sync_lag_vec() -> Vec { - vec![ - get_state_sync_lag(AlertEnvFiltering::MainnetStyleAlerts, AlertSeverity::Regular), - get_state_sync_lag(AlertEnvFiltering::TestnetStyleAlerts, AlertSeverity::DayOnly), - ] + vec![get_state_sync_lag(AlertSeverity::Regular)] } fn get_state_sync_stuck( alert_name: &'static str, - alert_env_filtering: AlertEnvFiltering, duration: Duration, alert_severity: AlertSeverity, ) -> Alert { @@ -69,7 +60,6 @@ fn get_state_sync_stuck( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::Applicable, - alert_env_filtering, ) } @@ -77,19 +67,11 @@ pub(crate) fn get_state_sync_stuck_vec() -> Vec { vec![ get_state_sync_stuck( "state_sync_stuck", - AlertEnvFiltering::MainnetStyleAlerts, Duration::from_secs(2 * SECS_IN_MIN), AlertSeverity::Regular, ), - get_state_sync_stuck( - "state_sync_stuck", - AlertEnvFiltering::TestnetStyleAlerts, - Duration::from_secs(2 * SECS_IN_MIN), - AlertSeverity::DayOnly, - ), get_state_sync_stuck( "state_sync_stuck_long_time", - AlertEnvFiltering::TestnetStyleAlerts, Duration::from_secs(30 * SECS_IN_MIN), AlertSeverity::Regular, ), diff --git a/crates/apollo_dashboard/src/alert_scenarios/tps.rs b/crates/apollo_dashboard/src/alert_scenarios/tps.rs index 258044c9230..d10ab087f3e 100644 --- a/crates/apollo_dashboard/src/alert_scenarios/tps.rs +++ b/crates/apollo_dashboard/src/alert_scenarios/tps.rs @@ -12,7 +12,6 @@ use crate::alerts::{ Alert, AlertComparisonOp, AlertCondition, - AlertEnvFiltering, AlertGroup, AlertLogicalOp, AlertSeverity, @@ -43,7 +42,6 @@ fn build_idle_alert( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -77,10 +75,7 @@ pub(crate) fn get_mempool_add_tx_idle() -> Alert { ) } -fn get_gateway_low_successful_transaction_rate( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_gateway_low_successful_transaction_rate(alert_severity: AlertSeverity) -> Alert { Alert::new( "gateway_low_successful_transaction_rate", "gateway low successful transaction rate", @@ -94,19 +89,9 @@ fn get_gateway_low_successful_transaction_rate( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_gateway_low_successful_transaction_rate_vec() -> Vec { - vec![ - get_gateway_low_successful_transaction_rate( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::DayOnly, - ), - get_gateway_low_successful_transaction_rate( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::WorkingHours, - ), - ] + vec![get_gateway_low_successful_transaction_rate(AlertSeverity::DayOnly)] } diff --git a/crates/apollo_dashboard/src/alert_scenarios/transaction_delays.rs b/crates/apollo_dashboard/src/alert_scenarios/transaction_delays.rs index d9b48f9d3f3..14a406889ff 100644 --- a/crates/apollo_dashboard/src/alert_scenarios/transaction_delays.rs +++ b/crates/apollo_dashboard/src/alert_scenarios/transaction_delays.rs @@ -10,7 +10,6 @@ use crate::alerts::{ Alert, AlertComparisonOp, AlertCondition, - AlertEnvFiltering, AlertGroup, AlertLogicalOp, AlertSeverity, @@ -21,10 +20,7 @@ use crate::alerts::{ // TODO(shahak): add gateway latency alert -fn get_mempool_p2p_peer_down( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_mempool_p2p_peer_down(alert_severity: AlertSeverity) -> Alert { Alert::new( "mempool_p2p_peer_down", "Mempool p2p peer down", @@ -40,26 +36,16 @@ fn get_mempool_p2p_peer_down( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_mempool_p2p_peer_down_vec() -> Vec { - vec![ - get_mempool_p2p_peer_down(AlertEnvFiltering::MainnetStyleAlerts, AlertSeverity::Regular), - get_mempool_p2p_peer_down( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::WorkingHours, - ), - ] + vec![get_mempool_p2p_peer_down(AlertSeverity::Regular)] } /// Triggers if the average latency of `add_tx` calls, across all HTTP servers, exceeds 15 seconds /// over a 5-minute window. -fn get_http_server_avg_add_tx_latency_alert( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_http_server_avg_add_tx_latency_alert(alert_severity: AlertSeverity) -> Alert { let sum_metric = HTTP_SERVER_ADD_TX_LATENCY.get_name_sum_with_filter(); let count_metric = HTTP_SERVER_ADD_TX_LATENCY.get_name_count_with_filter(); @@ -75,29 +61,16 @@ fn get_http_server_avg_add_tx_latency_alert( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_http_server_avg_add_tx_latency_alert_vec() -> Vec { - vec![ - get_http_server_avg_add_tx_latency_alert( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::DayOnly, - ), - get_http_server_avg_add_tx_latency_alert( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::WorkingHours, - ), - ] + vec![get_http_server_avg_add_tx_latency_alert(AlertSeverity::DayOnly)] } /// Triggers if the latency of all `add_tx` calls, across all HTTP servers, exceeds 1 second /// over a 2-minute window. -fn get_http_server_min_add_tx_latency_alert( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_http_server_min_add_tx_latency_alert(alert_severity: AlertSeverity) -> Alert { const TIME_WINDOW: &str = "2m"; let bucket_metric = HTTP_SERVER_ADD_TX_LATENCY.get_name_with_filer_and_additional_fields("le=\"1.0\""); @@ -120,29 +93,16 @@ fn get_http_server_min_add_tx_latency_alert( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_http_server_min_add_tx_latency_alert_vec() -> Vec { - vec![ - get_http_server_min_add_tx_latency_alert( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::Sos, - ), - get_http_server_min_add_tx_latency_alert( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::Regular, - ), - ] + vec![get_http_server_min_add_tx_latency_alert(AlertSeverity::Sos)] } /// Triggers when the slowest 5% of transactions for a specific HTTP server are taking longer than 2 /// seconds over a 5-minute window. -fn get_http_server_p95_add_tx_latency_alert( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_http_server_p95_add_tx_latency_alert(alert_severity: AlertSeverity) -> Alert { Alert::new( "http_server_p95_add_tx_latency", "High HTTP server P95 add_tx latency", @@ -156,28 +116,14 @@ fn get_http_server_p95_add_tx_latency_alert( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_http_server_p95_add_tx_latency_alert_vec() -> Vec { - vec![ - get_http_server_p95_add_tx_latency_alert( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::Informational, - ), - get_http_server_p95_add_tx_latency_alert( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::Informational, - ), - ] + vec![get_http_server_p95_add_tx_latency_alert(AlertSeverity::Informational)] } -fn get_high_empty_blocks_ratio_alert( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, - ratio: f64, -) -> Alert { +fn get_high_empty_blocks_ratio_alert(alert_severity: AlertSeverity, ratio: f64) -> Alert { const ALERT_NAME: &str = "high_empty_blocks_ratio"; // Our histogram buckets are static and the smallest bucket is 0.001. let lowest_histogram_bucket_value = HISTOGRAM_BUCKETS[0]; @@ -206,21 +152,9 @@ fn get_high_empty_blocks_ratio_alert( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_high_empty_blocks_ratio_alert_vec() -> Vec { - vec![ - get_high_empty_blocks_ratio_alert( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::Sos, - 0.3, - ), - get_high_empty_blocks_ratio_alert( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::Regular, - 0.6, - ), - ] + vec![get_high_empty_blocks_ratio_alert(AlertSeverity::Sos, 0.3)] } diff --git a/crates/apollo_dashboard/src/alert_scenarios/transaction_failures.rs b/crates/apollo_dashboard/src/alert_scenarios/transaction_failures.rs index cc89ba6e1af..4d1db5ac6f6 100644 --- a/crates/apollo_dashboard/src/alert_scenarios/transaction_failures.rs +++ b/crates/apollo_dashboard/src/alert_scenarios/transaction_failures.rs @@ -11,7 +11,6 @@ use crate::alerts::{ Alert, AlertComparisonOp, AlertCondition, - AlertEnvFiltering, AlertGroup, AlertLogicalOp, AlertSeverity, @@ -37,7 +36,6 @@ pub(crate) fn get_http_server_high_deprecated_transaction_failure_ratio() -> Ale EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::Informational, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } @@ -57,14 +55,10 @@ pub(crate) fn get_http_server_high_transaction_failure_ratio() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::Informational, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } -fn get_http_server_internal_error_ratio( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_http_server_internal_error_ratio(alert_severity: AlertSeverity) -> Alert { Alert::new( "http_server_internal_error_ratio", "http server internal error ratio", @@ -79,27 +73,14 @@ fn get_http_server_internal_error_ratio( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_http_server_internal_error_ratio_vec() -> Vec { - vec![ - get_http_server_internal_error_ratio( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::Regular, - ), - get_http_server_internal_error_ratio( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::WorkingHours, - ), - ] + vec![get_http_server_internal_error_ratio(AlertSeverity::Regular)] } -fn get_mempool_transaction_drop_ratio( - alert_env_filtering: AlertEnvFiltering, - alert_severity: AlertSeverity, -) -> Alert { +fn get_mempool_transaction_drop_ratio(alert_severity: AlertSeverity) -> Alert { Alert::new( "mempool_transaction_drop_ratio", "Mempool transaction drop ratio", @@ -119,21 +100,11 @@ fn get_mempool_transaction_drop_ratio( EVALUATION_INTERVAL_SEC_DEFAULT, alert_severity, ObserverApplicability::NotApplicable, - alert_env_filtering, ) } pub(crate) fn get_mempool_transaction_drop_ratio_vec() -> Vec { - vec![ - get_mempool_transaction_drop_ratio( - AlertEnvFiltering::MainnetStyleAlerts, - AlertSeverity::DayOnly, - ), - get_mempool_transaction_drop_ratio( - AlertEnvFiltering::TestnetStyleAlerts, - AlertSeverity::WorkingHours, - ), - ] + vec![get_mempool_transaction_drop_ratio(AlertSeverity::DayOnly)] } pub(crate) fn get_http_server_internal_error_once() -> Alert { @@ -150,6 +121,5 @@ pub(crate) fn get_http_server_internal_error_once() -> Alert { EVALUATION_INTERVAL_SEC_DEFAULT, AlertSeverity::WorkingHours, ObserverApplicability::NotApplicable, - AlertEnvFiltering::All, ) } diff --git a/crates/apollo_dashboard/src/alerts.rs b/crates/apollo_dashboard/src/alerts.rs index cc05e4ae82b..99173c4cc2b 100644 --- a/crates/apollo_dashboard/src/alerts.rs +++ b/crates/apollo_dashboard/src/alerts.rs @@ -1,9 +1,7 @@ use std::collections::HashSet; -use std::fmt; use serde::ser::SerializeStruct; use serde::{Serialize, Serializer}; -use strum_macros::EnumIter; use crate::alert_placeholders::{ ComparisonValueOrPlaceholder, @@ -22,23 +20,16 @@ pub struct Alerts { } impl Alerts { - pub(crate) fn new(alerts: Vec, alert_env_filtering: AlertEnvFiltering) -> Self { - let filtered_alerts: Vec = alerts - .into_iter() - .filter(|alert| alert.alert_env_filtering.matches(&alert_env_filtering)) - .collect(); - + pub(crate) fn new(alerts: Vec) -> Self { // Validate that there are no duplicate alert names. - filtered_alerts + alerts .iter() .map(|alert| alert.name.as_str()) .try_fold(HashSet::new(), |mut set, name| set.insert(name).then_some(set).ok_or(name)) - .unwrap_or_else(|duplicate| { - panic!("Duplicate alert name found: {duplicate} for env: {alert_env_filtering}") - }); + .unwrap_or_else(|duplicate| panic!("Duplicate alert name found: {duplicate}")); // Validate that there are no duplicate placeholder names across all alerts. - filtered_alerts + alerts .iter() .flat_map(|alert| alert.get_placeholder_names().iter()) .try_fold(HashSet::new(), |mut set, name| { @@ -48,33 +39,7 @@ impl Alerts { panic!("Duplicate placeholder name found across alerts: {duplicate}") }); - Self { alerts: filtered_alerts } - } -} - -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, EnumIter)] -pub enum AlertEnvFiltering { - All, - MainnetStyleAlerts, - TestnetStyleAlerts, -} - -impl AlertEnvFiltering { - pub fn matches(&self, target: &AlertEnvFiltering) -> bool { - self == target || *self == AlertEnvFiltering::All - } -} - -impl fmt::Display for AlertEnvFiltering { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let s = match self { - AlertEnvFiltering::All => { - unreachable!() - } // This variant is used for internal logic and should not be displayed. - AlertEnvFiltering::MainnetStyleAlerts => "mainnet", - AlertEnvFiltering::TestnetStyleAlerts => "testnet", - }; - write!(f, "{s}") + Self { alerts } } } @@ -226,8 +191,6 @@ pub(crate) struct Alert { // Indicates if relevant for observer nodes. observer_applicable: ObserverApplicability, #[serde(skip)] - alert_env_filtering: AlertEnvFiltering, - #[serde(skip)] placeholder_names: HashSet, } @@ -261,7 +224,6 @@ impl Alert { evaluation_interval_sec: u64, severity: impl Into, observer_applicable: ObserverApplicability, - alert_env_filtering: AlertEnvFiltering, ) -> Self { let severity = severity.into(); @@ -293,7 +255,6 @@ impl Alert { evaluation_interval_sec, severity, observer_applicable, - alert_env_filtering, placeholder_names, } } diff --git a/crates/apollo_dashboard/src/bin/sequencer_dashboard_generator.rs b/crates/apollo_dashboard/src/bin/sequencer_dashboard_generator.rs index d1c0b9d7a59..e586efc1ae0 100644 --- a/crates/apollo_dashboard/src/bin/sequencer_dashboard_generator.rs +++ b/crates/apollo_dashboard/src/bin/sequencer_dashboard_generator.rs @@ -1,11 +1,9 @@ use std::env; use apollo_dashboard::alert_definitions::{get_apollo_alerts, get_dev_alerts_json_path}; -use apollo_dashboard::alerts::AlertEnvFiltering; use apollo_dashboard::dashboard_definitions::{get_apollo_dashboard, DEV_JSON_PATH}; use apollo_infra_utils::dumping::serialize_to_file; use apollo_infra_utils::path::resolve_project_relative_path; -use strum::IntoEnumIterator; /// Creates the dashboard and alerts json files. fn main() { @@ -13,13 +11,5 @@ fn main() { .expect("Couldn't set working dir."); serialize_to_file(&get_apollo_dashboard(), DEV_JSON_PATH); - for alert_env_filtering in AlertEnvFiltering::iter() { - if alert_env_filtering == AlertEnvFiltering::All { - continue; // Skip the 'All' variant, as it used to cover all other options. - } - serialize_to_file( - &get_apollo_alerts(alert_env_filtering), - &get_dev_alerts_json_path(alert_env_filtering), - ); - } + serialize_to_file(&get_apollo_alerts(), &get_dev_alerts_json_path()); } diff --git a/crates/apollo_dashboard/src/dashboard_definitions_test.rs b/crates/apollo_dashboard/src/dashboard_definitions_test.rs index 8fec5ed45cd..5bf0123ce1b 100644 --- a/crates/apollo_dashboard/src/dashboard_definitions_test.rs +++ b/crates/apollo_dashboard/src/dashboard_definitions_test.rs @@ -1,8 +1,6 @@ use apollo_infra_utils::dumping::serialize_to_file_test; -use strum::IntoEnumIterator; use crate::alert_definitions::{get_apollo_alerts, get_dev_alerts_json_path}; -use crate::alerts::AlertEnvFiltering; use crate::dashboard_definitions::{get_apollo_dashboard, DEV_JSON_PATH}; const FIX_BINARY_NAME: &str = "sequencer_dashboard_generator"; @@ -12,14 +10,5 @@ const FIX_BINARY_NAME: &str = "sequencer_dashboard_generator"; #[test] fn default_dev_grafana_dashboard() { serialize_to_file_test(&get_apollo_dashboard(), DEV_JSON_PATH, FIX_BINARY_NAME); - for alert_env_filtering in AlertEnvFiltering::iter() { - if alert_env_filtering == AlertEnvFiltering::All { - continue; // Skip the 'All' variant, as it used to cover all other options. - } - serialize_to_file_test( - &get_apollo_alerts(alert_env_filtering), - &get_dev_alerts_json_path(alert_env_filtering), - FIX_BINARY_NAME, - ); - } + serialize_to_file_test(&get_apollo_alerts(), &get_dev_alerts_json_path(), FIX_BINARY_NAME); } diff --git a/crates/apollo_dashboard/src/dashboard_test.rs b/crates/apollo_dashboard/src/dashboard_test.rs index 72955465742..565156c9438 100644 --- a/crates/apollo_dashboard/src/dashboard_test.rs +++ b/crates/apollo_dashboard/src/dashboard_test.rs @@ -5,7 +5,6 @@ use crate::alerts::{ Alert, AlertComparisonOp, AlertCondition, - AlertEnvFiltering, AlertGroup, AlertLogicalOp, AlertSeverity, @@ -25,7 +24,6 @@ fn serialize_alert() { 20, AlertSeverity::Sos, ObserverApplicability::Applicable, - AlertEnvFiltering::All, ); let serialized = serde_json::to_value(&alert).unwrap(); diff --git a/deployments/local-testing/deploy.sh b/deployments/local-testing/deploy.sh index fd034d92361..478fc30cfe6 100755 --- a/deployments/local-testing/deploy.sh +++ b/deployments/local-testing/deploy.sh @@ -1511,7 +1511,7 @@ for rule in rules: upload_output=$("$venv_python" "${monitoring_dir}/src/main.py" \ --dev-dashboards-file "${SEQUENCER_ROOT_DIR}/crates/apollo_dashboard/resources/dev_grafana.json" \ - --dev-alerts-file "${SEQUENCER_ROOT_DIR}/crates/apollo_dashboard/resources/dev_grafana_alerts_testnet.json" \ + --dev-alerts-file "${SEQUENCER_ROOT_DIR}/crates/apollo_dashboard/resources/dev_grafana_alerts.json" \ --out-dir /tmp/grafana_builder \ --env dev \ --grafana-url "http://localhost:3000" \ @@ -1707,22 +1707,7 @@ else: # Upload Grafana alert rules only (for quick iteration on alerts) # Usage: upload_alerts [dev|testnet|mainnet] upload_alerts() { - local alert_env="${1:-dev}" # Default to dev - - # Validate environment - case "$alert_env" in - dev|testnet|mainnet) - ;; - *) - log_error "Invalid alert environment: $alert_env" - log_error "Valid options: dev, testnet, mainnet" - return 1 - ;; - esac - - # Use the base alerts file path - the Python script will resolve to the correct - # environment-specific file (_mainnet or _testnet) based on the --env parameter - # dev -> mainnet, testnet -> testnet, mainnet -> mainnet (see helpers.py alert_env_filename_suffix) + # Use the base alerts file path local alerts_file="${SEQUENCER_ROOT_DIR}/crates/apollo_dashboard/resources/dev_grafana_alerts.json" verify_k3d_cluster @@ -1834,7 +1819,7 @@ for rule in rules: fi # Upload alert rules only (no dashboards file) - # The Python script will resolve to dev_grafana_alerts_.json based on env + # The Python script will resolve to dev_grafana_alerts.json based on env # dev->mainnet, testnet->testnet, mainnet->mainnet local resolved_suffix case "$alert_env" in @@ -1842,7 +1827,7 @@ for rule in rules: testnet) resolved_suffix="testnet" ;; mainnet) resolved_suffix="mainnet" ;; esac - log_info "Uploading alert rules (env: ${alert_env} -> dev_grafana_alerts_${resolved_suffix}.json)" + log_info "Uploading alert rules (env: ${alert_env} -> dev_grafana_alerts.json)" local upload_output="" upload_output=$("$venv_python" "${monitoring_dir}/src/main.py" \ --dev-alerts-file "$alerts_file" \ diff --git a/deployments/monitoring/src/builders/alert_builder.py b/deployments/monitoring/src/builders/alert_builder.py index 0d2b4fa5f51..2251086934f 100755 --- a/deployments/monitoring/src/builders/alert_builder.py +++ b/deployments/monitoring/src/builders/alert_builder.py @@ -12,7 +12,6 @@ load_config_file, validate_config_overrides, ) -from common.env import EnvironmentName, alert_env_filename_suffix from common.grafana10_objects import ( alert_expression_model_object, alert_query_model_object, @@ -240,35 +239,23 @@ def post_process_alert(alert: dict[str, any]) -> dict[str, any]: return alert -# TODO(Tsabary): remove the vanilla path option once we transition to per-env file. -def resolve_dev_alerts_file_path(path: str, suffix: str) -> str: +def resolve_dev_alerts_file_path(path: str) -> str: """ Resolve a JSON path: - - If the original file exists, return it. - - Otherwise, check for `_.json`. + - If the file exists, return it. - Raise an error if neither exists. """ if os.path.isfile(path): return path - # Insert suffix before `.json` - base, ext = os.path.splitext(path) - if ext.lower() != ".json": - raise ValueError(f"Expected a .json file, got: {path}") - - alternative_path = f"{base}_{suffix}{ext}" - if os.path.isfile(alternative_path): - return alternative_path - - raise FileNotFoundError(f"Neither '{path}' nor '{alternative_path}' exists.") + raise FileNotFoundError(f"'{path}' does not exist.") def alert_builder(args: argparse.Namespace): global logger logger = get_logger(name="alert_builder", debug=args.debug) - suffix = alert_env_filename_suffix(env=EnvironmentName(args.env)) - alert_file_path = resolve_dev_alerts_file_path(path=args.dev_alerts_file, suffix=suffix) + alert_file_path = resolve_dev_alerts_file_path(path=args.dev_alerts_file) with open(alert_file_path, "r") as f: dev_alerts = json.load(f) @@ -340,7 +327,6 @@ def alert_builder(args: argparse.Namespace): datasource_uid=args.datasource_uid, labels={ "og_priority": dev_alert["severity"], - "environment": args.env, "observer_applicable": dev_alert["observer_applicable"], }, ) diff --git a/deployments/monitoring/src/builders/dashboard_builder.py b/deployments/monitoring/src/builders/dashboard_builder.py index 30999207199..b1699c11f34 100755 --- a/deployments/monitoring/src/builders/dashboard_builder.py +++ b/deployments/monitoring/src/builders/dashboard_builder.py @@ -6,7 +6,6 @@ from urllib.parse import quote import requests -from common.env import EnvironmentName from common.grafana10_objects import empty_dashboard, row_object, templating_object from common.logger import get_logger @@ -142,7 +141,7 @@ def dashboard_file_name(out_dir: str, dashboard_name: str) -> str: return f"{out_dir}/{file_name}.json" -def create_dashboard(dashboard_name: str, dev_dashboard: json, env: EnvironmentName) -> dict: +def create_dashboard(dashboard_name: str, dev_dashboard: json) -> dict: dashboard = empty_dashboard.copy() templating = templating_object.copy() panel_id = 1 @@ -217,7 +216,6 @@ def dashboard_builder(args: argparse.Namespace) -> None: create_dashboard( dashboard_name=dashboard_name, dev_dashboard=dev_json[dashboard_name], - env=EnvironmentName(args.env), ), ] ) diff --git a/deployments/monitoring/src/common/cli.py b/deployments/monitoring/src/common/cli.py index 59b10ef6b1e..9a904984b42 100644 --- a/deployments/monitoring/src/common/cli.py +++ b/deployments/monitoring/src/common/cli.py @@ -1,7 +1,5 @@ import argparse -from common.env import EnvironmentName - def arg_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description="Build And Upload Grafana Alerts") @@ -61,12 +59,6 @@ def arg_parser() -> argparse.ArgumentParser: required=True, help="Provide Kubernetes cluster to inject into alert expressions.", ) - parser.add_argument( - "--env", - type=str, - choices=[e.value for e in EnvironmentName], - required=True, - ) parser.add_argument( "--dashboard-overrides-config-file", type=str, diff --git a/deployments/monitoring/src/common/env.py b/deployments/monitoring/src/common/env.py index 23548d65c5e..4115d656e50 100644 --- a/deployments/monitoring/src/common/env.py +++ b/deployments/monitoring/src/common/env.py @@ -1,3 +1,5 @@ +# TODO(Tsabary): remove this entire module. + from enum import Enum @@ -8,7 +10,6 @@ class EnvironmentName(Enum): MAINNET = "mainnet" -# TODO(Tsabary): remove this enum and use the placeholder mechanism instead. # Translates the environment name to a suffix for alert filenames. We use the `mainnet` setting for development and the mainnet environment. # The `testnet` setting is used for integration and testnet environments. def alert_env_filename_suffix(env: EnvironmentName) -> str: