Merged in MON-308-improve-load-balancer-healthly-h (pull request #163)

MON-308 improve unhealthy instances load balancer monitor

Approved-by: Quentin Manfroi <quentin.manfroi@yahoo.fr>
Approved-by: Jean-Maxime LEBLANC <jean-maxime.leblanc@fr.clara.net>
Approved-by: Patrick Decat <patrick.decat@fr.clara.net>
Approved-by: Alexandre Gaillet <alexandre.gaillet@fr.clara.net>
2018-09-27 08:21:40 +00:00 · 2018-09-27 08:21:40 +00:00 · 0c8d3cc016
commit 0c8d3cc016
parent 226e4077b7 99e7f57506
4 changed files with 18 additions and 8 deletions
--- a/cloud/aws/alb/README.md
+++ b/cloud/aws/alb/README.md
@@ -16,10 +16,10 @@ module "datadog-monitors-cloud-aws-alb" {

 Creates DataDog monitors with the following checks:

+- ALB healthy instances
 - ALB HTTP code 4xx
 - ALB HTTP code 5xx
 - ALB latency
-- ALB no healthy instances
 - ALB target HTTP code 4xx
 - ALB target HTTP code 5xx

--- a/cloud/aws/alb/monitors-alb.tf
+++ b/cloud/aws/alb/monitors-alb.tf
@@ -1,13 +1,15 @@
 resource "datadog_monitor" "ALB_no_healthy_instances" {
  count   = "${var.alb_no_healthy_instances_enabled ? 1 : 0}"
-  name    = "[${var.environment}] ALB no healthy instances"
+  name    = "[${var.environment}] ALB healthy instances {{#is_alert}}at 0{{/is_alert}}{{#is_warning}}{{value}}at {{value}}%{{/is_warning}}"
  type    = "metric alert"
  message = "${coalesce(var.alb_no_healthy_instances_message, var.message)}"

  query = <<EOF
    ${var.alb_no_healthy_instances_time_aggregator}(${var.alb_no_healthy_instances_timeframe}): (
-      sum:aws.applicationelb.healthy_host_count${module.filter-tags.query_alert} by {region,loadbalancer}
-    ) < 1
+      sum:aws.applicationelb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} / (
+      sum:aws.applicationelb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} +
+      sum:aws.applicationelb.un_healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} )
+    ) * 100 < 1
  EOF

  evaluation_delay = "${var.evaluation_delay}"
@@ -15,6 +17,7 @@ resource "datadog_monitor" "ALB_no_healthy_instances" {

  thresholds {
    critical = 1
+    warning  = 100
  }

  notify_no_data      = true
--- a/cloud/aws/elb/README.md
+++ b/cloud/aws/elb/README.md
@@ -20,8 +20,8 @@ Creates DataDog monitors with the following checks:
 - ELB 5xx errors too high
 - ELB backend 4xx errors too high
 - ELB backend 5xx errors too high
+- ELB healthy instances
 - ELB latency too high
-- ELB no healthy instances

 ## Inputs

--- a/cloud/aws/elb/monitors-elb.tf
+++ b/cloud/aws/elb/monitors-elb.tf
@@ -1,16 +1,23 @@
 resource "datadog_monitor" "ELB_no_healthy_instances" {
  count   = "${var.elb_no_healthy_instance_enabled ? 1 : 0}"
-  name    = "[${var.environment}] ELB no healthy instances"
+  name    = "[${var.environment}] ELB healthy instances {{#is_alert}}at 0{{/is_alert}}{{#is_warning}}{{value}}at {{value}}%{{/is_warning}}"
  message = "${coalesce(var.elb_no_healthy_instance_message, var.message)}"

  query = <<EOF
    ${var.elb_no_healthy_instance_time_aggregator}(${var.elb_no_healthy_instance_timeframe}): (
-      sum:aws.elb.healthy_host_count${module.filter-tags.query_alert} by {region,loadbalancername}
-    ) < 1
+      sum:aws.elb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} / (
+      sum:aws.elb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} +
+      sum:aws.elb.un_healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} )
+    ) * 100 < 1
  EOF

  type = "metric alert"

+  thresholds {
+    critical = 1
+    warning  = 100
+  }
+
  notify_no_data      = true
  evaluation_delay    = "${var.evaluation_delay}"
  renotify_interval   = 0