-
Notifications
You must be signed in to change notification settings - Fork 511
[AWS] Introduce initial alert rule templates #15346
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 11 commits
db8282f
bbb5db6
94ffdb6
6ccc000
14419f7
74fd7dc
665711c
0b5ec5a
d28dd85
a0a49eb
84f231e
978d967
4cc98a2
1efc93b
6f3758b
84f98b5
5a38d36
9fefffa
ca349a8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| { | ||
| "id": "ec2-high-cpu-utilization", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "EC2 High CPU Utilization", | ||
| "tags": ["aws", "ec2", "cpu"], | ||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "1m" | ||
| }, | ||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 15, | ||
| "timeWindowUnit": "m", | ||
| "threshold": [ | ||
| | ||
| 0 | ||
| ], | ||
| "thresholdComparator": ">", | ||
| "size": 100, | ||
| "esqlQuery": { | ||
| "esql": "FROM metrics-aws.ec2_metrics-default\n| STATS cpuutilization=avg(host.cpu.usage*100) by cloud.account.id, cloud.region, aws.dimensions.InstanceId\n| WHERE cpuutilization >= 80" | ||
| }, | ||
| "aggType": "count", | ||
| "groupBy": "all", | ||
| | ||
| "termSize": 5, | ||
| "sourceFields": [], | ||
| "timeField": "event.ingested", | ||
| | ||
| "excludeHitsFromPreviousRun": true | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| { | ||
| "id": "ec2-status-check-failed", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "EC2 Status Check Failed", | ||
| "tags": ["aws", "ec2", "status"], | ||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "1m" | ||
| }, | ||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 15, | ||
| "timeWindowUnit": "m", | ||
| "threshold": [ | ||
| 0 | ||
| ], | ||
| "thresholdComparator": ">", | ||
| "size": 100, | ||
| "esqlQuery": { | ||
| "esql": "FROM metrics-aws.ec2_metrics-default\n| STATS statusfailed=max(aws.ec2.metrics.StatusCheckFailed.avg) by cloud.account.id, cloud.region, aws.dimensions.InstanceId\n| WHERE statusfailed > 0" | ||
| }, | ||
| "aggType": "count", | ||
| "groupBy": "all", | ||
| "termSize": 5, | ||
| "sourceFields": [], | ||
| "timeField": "event.ingested", | ||
| "excludeHitsFromPreviousRun": true | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| { | ||
| "id": "lambda-errors", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "Lambda Errors", | ||
| "tags": ["aws", "lambda", "errors"], | ||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "1m" | ||
| }, | ||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 15, | ||
| "timeWindowUnit": "m", | ||
| "threshold": [ | ||
| 0 | ||
| ], | ||
| "thresholdComparator": ">", | ||
| "size": 100, | ||
| "esqlQuery": { | ||
| "esql": "FROM metrics-aws.lambda-default\n| STATS errors=sum(aws.lambda.Errors.avg) by cloud.account.id, cloud.region, aws.dimensions.FunctionName\n| WHERE errors > 0" | ||
| }, | ||
| "aggType": "count", | ||
| "groupBy": "all", | ||
| "termSize": 5, | ||
| "sourceFields": [], | ||
| "timeField": "event.ingested", | ||
| "excludeHitsFromPreviousRun": true | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| { | ||
| "id": "lambda-throttles", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "Lambda Throttles", | ||
| "tags": ["aws", "lambda", "throttles"], | ||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "1m" | ||
| }, | ||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 15, | ||
| "timeWindowUnit": "m", | ||
| "threshold": [ | ||
| 0 | ||
| ], | ||
| "thresholdComparator": ">", | ||
| "size": 100, | ||
| "esqlQuery": { | ||
| "esql": "FROM metrics-aws.lambda-default\n| STATS throttles=sum(aws.lambda.Throttles.avg) by cloud.account.id, cloud.region, aws.dimensions.FunctionName\n| WHERE throttles > 0" | ||
| }, | ||
| "aggType": "count", | ||
| "groupBy": "all", | ||
| "termSize": 5, | ||
| "sourceFields": [], | ||
| "timeField": "event.ingested", | ||
| "excludeHitsFromPreviousRun": true | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| { | ||
| "id": "sns-notifications-failed", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "SNS Notifications Failed", | ||
| "tags": ["aws", "sns", "notifications"], | ||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "1m" | ||
| }, | ||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 15, | ||
| "timeWindowUnit": "m", | ||
| "threshold": [ | ||
| 0 | ||
| ], | ||
| "thresholdComparator": ">", | ||
| "size": 100, | ||
| "esqlQuery": { | ||
| "esql": "FROM metrics-aws.sns-default\n| STATS notificationsfailed=avg(aws.sns.NumberOfNotificationsFailed.sum) by cloud.account.id, cloud.region, aws.dimensions.TopicName\n| WHERE notificationsfailed > 0" | ||
| }, | ||
| "aggType": "count", | ||
| "groupBy": "all", | ||
| "termSize": 5, | ||
| "sourceFields": [], | ||
| "timeField": "event.ingested", | ||
| "excludeHitsFromPreviousRun": true | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| { | ||
| "id": "sns-notifications-filtered-out", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "SNS Notifications Filtered Out", | ||
| "tags": ["aws", "sns", "notifications"], | ||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "1m" | ||
| }, | ||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 15, | ||
| "timeWindowUnit": "m", | ||
| "threshold": [ | ||
| 0 | ||
| ], | ||
| "thresholdComparator": ">", | ||
| "size": 100, | ||
| "esqlQuery": { | ||
| "esql": "FROM metrics-aws.sns-default\n| STATS notificationsfilteredout=avg(aws.sns.NumberOfNotificationsFilteredOut-InvalidAttributes.sum) by cloud.account.id, cloud.region, aws.dimensions.TopicName\n| WHERE notificationsfilteredout > 0" | ||
| }, | ||
| "aggType": "count", | ||
| "groupBy": "all", | ||
| "termSize": 5, | ||
| "sourceFields": [], | ||
| "timeField": "event.ingested", | ||
| "excludeHitsFromPreviousRun": true | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| { | ||
| "id": "sqs-messages-visible", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "SQS Messages Visible", | ||
| "tags": ["aws", "sqs", "messages visible"], | ||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "1m" | ||
| }, | ||
|
Comment on lines
8
to
10
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This applies to all of the configurations. Should the rule run this frequently? I suggest making the interval equal to the default period value for metrics ingestion. Doing so helps avoid any no-data-found alerts (when the user decides to extend the configuration).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we set
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think that's a reasonable thing to do. The impact, I assume, will be that instead of an alert being notified at the period + 1m interval, the alert will be notified at a 2 x period interval. Here the period is 5m for most AWS services. @tommyers-elastic, what would be your recommendation?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think we have any way to couple configs in agent policy templates with these rule configurations, so whatever we choose will always have to be added by hand. My only thinking here is that it doesn't make sense to run a rule more frequently than the integration collection period. Matching the rule frequency with the collection period seems sensible to me.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's a shame there's no way to put hints in the form such that we could have something that shows up and says "should match the integration collection period" or something. if we think it's worthwhile we could suggest this as a feature. |
||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 15, | ||
| "timeWindowUnit": "m", | ||
| "threshold": [ | ||
| 0 | ||
| ], | ||
| "thresholdComparator": ">", | ||
| "size": 100, | ||
| "esqlQuery": { | ||
| "esql": "FROM metrics-aws.sqs-default\n| STATS msgsvisible=max(aws.sqs.messages.visible) by cloud.account.id, cloud.region, aws.dimensions.QueueName\n| WHERE msgsvisible >= 1000" | ||
| }, | ||
| "aggType": "count", | ||
| "groupBy": "all", | ||
| "termSize": 5, | ||
| "sourceFields": [], | ||
| "timeField": "event.ingested", | ||
| "excludeHitsFromPreviousRun": true | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| { | ||
| "id": "sqs-oldest-message", | ||
| "type": "alerting_rule_template", | ||
| "attributes": { | ||
| "name": "SQS Oldest Message", | ||
| "tags": ["aws", "sqs", "oldest message"], | ||
| "ruleTypeId": ".es-query", | ||
| "schedule": { | ||
| "interval": "1m" | ||
| }, | ||
| "params": { | ||
| "searchType": "esqlQuery", | ||
| "timeWindowSize": 15, | ||
| "timeWindowUnit": "m", | ||
| "threshold": [ | ||
| 0 | ||
| ], | ||
| "thresholdComparator": ">", | ||
| "size": 100, | ||
| "esqlQuery": { | ||
| "esql": "FROM metrics-aws.sqs-default\n| STATS oldestmsgage=max(aws.sqs.oldest_message_age.sec) by cloud.account.id, cloud.region, aws.dimensions.QueueName\n| WHERE oldestmsgage >= 300" | ||
| }, | ||
| "aggType": "count", | ||
| "groupBy": "all", | ||
| "termSize": 5, | ||
| "sourceFields": [], | ||
| "timeField": "event.ingested", | ||
| "excludeHitsFromPreviousRun": true | ||
| }, | ||
| "alertDelay": { | ||
| "active": 1 | ||
| } | ||
| }, | ||
| "managed": true, | ||
| "coreMigrationVersion": "8.8.0", | ||
| "typeMigrationVersion": "10.1.0" | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Where do we declare which service (entity) this alert template applies to? Something like resource : aws.ec2
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have included the service name in the name of the alert rule template. I suppose Kibana should allow us to filter by tags or by partial matches on the title of the alert rule template.