Merged in MON-308-improve-load-balancer-healthly-h (pull request #163)

MON-308 improve unhealthy instances load balancer monitor

Approved-by: Quentin Manfroi <quentin.manfroi@yahoo.fr>
Approved-by: Jean-Maxime LEBLANC <jean-maxime.leblanc@fr.clara.net>
Approved-by: Patrick Decat <patrick.decat@fr.clara.net>
Approved-by: Alexandre Gaillet <alexandre.gaillet@fr.clara.net>
This commit is contained in:
Quentin Manfroi 2018-09-27 08:21:40 +00:00
commit 0c8d3cc016
4 changed files with 18 additions and 8 deletions

View File

@ -16,10 +16,10 @@ module "datadog-monitors-cloud-aws-alb" {
Creates DataDog monitors with the following checks: Creates DataDog monitors with the following checks:
- ALB healthy instances
- ALB HTTP code 4xx - ALB HTTP code 4xx
- ALB HTTP code 5xx - ALB HTTP code 5xx
- ALB latency - ALB latency
- ALB no healthy instances
- ALB target HTTP code 4xx - ALB target HTTP code 4xx
- ALB target HTTP code 5xx - ALB target HTTP code 5xx

View File

@ -1,13 +1,15 @@
resource "datadog_monitor" "ALB_no_healthy_instances" { resource "datadog_monitor" "ALB_no_healthy_instances" {
count = "${var.alb_no_healthy_instances_enabled ? 1 : 0}" count = "${var.alb_no_healthy_instances_enabled ? 1 : 0}"
name = "[${var.environment}] ALB no healthy instances" name = "[${var.environment}] ALB healthy instances {{#is_alert}}at 0{{/is_alert}}{{#is_warning}}at {{value}}%{{/is_warning}}"
type = "metric alert" type = "metric alert"
message = "${coalesce(var.alb_no_healthy_instances_message, var.message)}" message = "${coalesce(var.alb_no_healthy_instances_message, var.message)}"
query = <<EOF query = <<EOF
${var.alb_no_healthy_instances_time_aggregator}(${var.alb_no_healthy_instances_timeframe}): ( ${var.alb_no_healthy_instances_time_aggregator}(${var.alb_no_healthy_instances_timeframe}): (
sum:aws.applicationelb.healthy_host_count${module.filter-tags.query_alert} by {region,loadbalancer} sum:aws.applicationelb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} / (
) < 1 sum:aws.applicationelb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} +
sum:aws.applicationelb.un_healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} )
) * 100 < 1
EOF EOF
evaluation_delay = "${var.evaluation_delay}" evaluation_delay = "${var.evaluation_delay}"
@ -15,6 +17,7 @@ resource "datadog_monitor" "ALB_no_healthy_instances" {
thresholds { thresholds {
critical = 1 critical = 1
warning = 100
} }
notify_no_data = true notify_no_data = true

View File

@ -20,8 +20,8 @@ Creates DataDog monitors with the following checks:
- ELB 5xx errors too high - ELB 5xx errors too high
- ELB backend 4xx errors too high - ELB backend 4xx errors too high
- ELB backend 5xx errors too high - ELB backend 5xx errors too high
- ELB healthy instances
- ELB latency too high - ELB latency too high
- ELB no healthy instances
## Inputs ## Inputs

View File

@ -1,16 +1,23 @@
resource "datadog_monitor" "ELB_no_healthy_instances" { resource "datadog_monitor" "ELB_no_healthy_instances" {
count = "${var.elb_no_healthy_instance_enabled ? 1 : 0}" count = "${var.elb_no_healthy_instance_enabled ? 1 : 0}"
name = "[${var.environment}] ELB no healthy instances" name = "[${var.environment}] ELB healthy instances {{#is_alert}}at 0{{/is_alert}}{{#is_warning}}at {{value}}%{{/is_warning}}"
message = "${coalesce(var.elb_no_healthy_instance_message, var.message)}" message = "${coalesce(var.elb_no_healthy_instance_message, var.message)}"
query = <<EOF query = <<EOF
${var.elb_no_healthy_instance_time_aggregator}(${var.elb_no_healthy_instance_timeframe}): ( ${var.elb_no_healthy_instance_time_aggregator}(${var.elb_no_healthy_instance_timeframe}): (
sum:aws.elb.healthy_host_count${module.filter-tags.query_alert} by {region,loadbalancername} sum:aws.elb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} / (
) < 1 sum:aws.elb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} +
sum:aws.elb.un_healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} )
) * 100 < 1
EOF EOF
type = "metric alert" type = "metric alert"
thresholds {
critical = 1
warning = 100
}
notify_no_data = true notify_no_data = true
evaluation_delay = "${var.evaluation_delay}" evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0