MON-93 - Add some monitors, and update names to respect new best practice

This commit is contained in:
Alexandre Gaillet 2018-02-21 15:24:25 +01:00
parent 5b68882cfe
commit a4b225f798
3 changed files with 141 additions and 83 deletions

View File

@ -20,8 +20,10 @@ Creates DataDog monitors with the following checks :
* ELB no healthy hosts
* ELB latency too high
* ELB http code 5xx percent too high
* ELB http code 4xx percent too high
* ELB http code 5xx percent too high
* ELB backend http code 4xx percent too high
* ELB backend http code 5xx percent too high
Inputs
------
@ -33,11 +35,14 @@ Inputs
| elb_4xx_threshold_warning | loadbalancer 4xx warning threshold in percentage | string | `5` | no |
| elb_5xx_threshold_critical | loadbalancer 5xx critical threshold in percentage | string | `10` | no |
| elb_5xx_threshold_warning | loadbalancer 5xx warning threshold in percentage | string | `5` | no |
| elb_backend_4xx_threshold_critical | loadbalancer backend 4xx critical threshold in percentage | string | `10` | no |
| elb_backend_4xx_threshold_warning | loadbalancer backend 4xx warning threshold in percentage | string | `5` | no |
| elb_backend_5xx_threshold_critical | loadbalancer backend 5xx critical threshold in percentage | string | `10` | no |
| elb_backend_5xx_threshold_warning | loadbalancer backend 5xx warning threshold in percentage | string | `5` | no |
| elb_backend_latency_critical | latency critical threshold in seconds | string | `5` | no |
| elb_backend_latency_warning | latency warning threshold in seconds | string | `1` | no |
| elb_notify_no_data | Use this variable to disable notify no data | string | `true` | no |
| environment | Architecture Environment | string | - | yes |
| evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| message | Message sent when an alert is triggered | string | - | yes |
| message | Message sent when an alert is triggered | string | - | yes |

View File

@ -29,9 +29,14 @@ variable "dd_aws_elb" {
default = "disable"
}
variable "elb_notify_no_data" {
description = "Use this variable to disable notify no data"
default = true
# Warning threshold (percent of requests) for the ELB-generated 4xx monitor.
variable "elb_4xx_threshold_warning" {
description = "loadbalancer 4xx warning threshold in percentage"
default = 5
}
# Critical threshold (percent of requests) for the ELB-generated 4xx monitor.
variable "elb_4xx_threshold_critical" {
description = "loadbalancer 4xx critical threshold in percentage"
default = 10
}
variable "elb_5xx_threshold_warning" {
@ -44,13 +49,23 @@ variable "elb_5xx_threshold_critical" {
default = 10
}
variable "elb_4xx_threshold_warning" {
description = "loadbalancer 4xx warning threshold in percentage"
# Warning threshold (percent of requests) for the backend-emitted 4xx monitor.
variable "elb_backend_4xx_threshold_warning" {
description = "loadbalancer backend 4xx warning threshold in percentage"
default = 5
}
variable "elb_4xx_threshold_critical" {
description = "loadbalancer 4xx critical threshold in percentage"
# Critical threshold (percent of requests) for the backend-emitted 4xx monitor.
variable "elb_backend_4xx_threshold_critical" {
description = "loadbalancer backend 4xx critical threshold in percentage"
default = 10
}
# Warning threshold (percent of requests) for the backend-emitted 5xx monitor.
variable "elb_backend_5xx_threshold_warning" {
description = "loadbalancer backend 5xx warning threshold in percentage"
default = 5
}
# Critical threshold (percent of requests) for the backend-emitted 5xx monitor.
variable "elb_backend_5xx_threshold_critical" {
description = "loadbalancer backend 5xx critical threshold in percentage"
default = 10
}

View File

@ -7,97 +7,39 @@ data "template_file" "filter" {
}
resource "datadog_monitor" "ELB_no_healthy_instances" {
name = "[${var.environment}] ELB no healthy instances on {{host.identifier}}"
name = "[${var.environment}] ELB no healthy instances"
message = "${var.message}"
query = <<EOF
avg(last_5m): (
avg:aws.elb.healthy_host_count{${data.template_file.filter.rendered}} by {loadbalancername,region}
) == 0
avg:aws.elb.healthy_host_count{${data.template_file.filter.rendered}} by {region,loadbalancer}
) < 1
EOF
type = "metric alert"
notify_no_data = "${var.elb_notify_no_data}"
notify_no_data = true
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = true
require_full_window = false
new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20
tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"]
}
resource "datadog_monitor" "ELB_unhealthy_instances" {
name = "[${var.environment}] ELB some unhealthy instances on {{host.identifier}}"
resource "datadog_monitor" "ELB_too_much_4xx" {
name = "[${var.environment}] ELB 4xx errors too high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
message = "${var.message}"
query = <<EOF
avg(last_5m): (
avg:aws.elb.un_healthy_host_count{${data.template_file.filter.rendered}} by {loadbalancername,region}
) > 0"
EOF
type = "metric alert"
notify_no_data = "${var.elb_notify_no_data}"
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = true
new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20
tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"]
}
resource "datadog_monitor" "ELB_too_much_5xx_backend" {
name = "[${var.environment}] ELB too much 5xx backend err on {{host.identifier}}"
message = "${var.message}"
query = <<EOF
avg(last_5m): (
avg:aws.elb.httpcode_backend_5xx{${data.template_file.filter.rendered}} by {loadbalancername,region} /
avg:aws.elb.request_count{${data.template_file.filter.rendered}} by {loadbalancername,region}
) * 100 > ${var.elb_5xx_threshold_critical}"
EOF
type = "metric alert"
thresholds {
warning = "${var.elb_5xx_threshold_warning}"
critical = "${var.elb_5xx_threshold_critical}"
}
notify_no_data = "${var.elb_notify_no_data}"
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = true
new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20
tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"]
}
resource "datadog_monitor" "ELB_too_much_4xx_backend" {
name = "[${var.environment}] ELB too much 4xx backend err on {{host.identifier}}"
message = "${var.message}"
query = <<EOF
avg(last_5m): (
avg:aws.elb.httpcode_backend_4xx{${data.template_file.filter.rendered}} by {loadbalancername,region} /
avg:aws.elb.request_count{${data.template_file.filter.rendered}} by {loadbalancername,region}
avg:aws.elb.httpcode_elb_4xx{${data.template_file.filter.rendered}} by {region,loadbalancer} /
avg:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}
) * 100 > ${var.elb_4xx_threshold_critical}"
EOF
@ -108,14 +50,110 @@ resource "datadog_monitor" "ELB_too_much_4xx_backend" {
critical = "${var.elb_4xx_threshold_critical}"
}
notify_no_data = "${var.elb_notify_no_data}"
notify_no_data = true
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = true
require_full_window = false
new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20
tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"]
}
# Alerts when ELB-generated 5xx responses exceed the configured percentage of total requests.
resource "datadog_monitor" "ELB_too_much_5xx" {
name = "[${var.environment}] ELB 5xx errors too high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
message = "${var.message}"
# Ratio of ELB 5xx codes to total requests over 5 minutes, as a percentage.
# NOTE: the original query ended with a stray '"' inside the heredoc, which
# would be submitted to Datadog as part of the query and rejected as invalid.
query = <<EOF
avg(last_5m): (
avg:aws.elb.httpcode_elb_5xx{${data.template_file.filter.rendered}} by {region,loadbalancer} /
avg:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}
) * 100 > ${var.elb_5xx_threshold_critical}
EOF
type = "metric alert"
thresholds {
warning = "${var.elb_5xx_threshold_warning}"
critical = "${var.elb_5xx_threshold_critical}"
}
notify_no_data = true
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = false
new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20
tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"]
}
# Alerts when backend-emitted 4xx responses exceed the configured percentage of total requests.
resource "datadog_monitor" "ELB_too_much_4xx_backend" {
name = "[${var.environment}] ELB backend 4xx errors too high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
message = "${var.message}"
# Ratio of backend 4xx codes to total requests over 5 minutes, as a percentage.
# NOTE: the original query ended with a stray '"' inside the heredoc, which
# would be submitted to Datadog as part of the query and rejected as invalid.
query = <<EOF
avg(last_5m): (
avg:aws.elb.httpcode_backend_4xx{${data.template_file.filter.rendered}} by {region,loadbalancer} /
avg:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}
) * 100 > ${var.elb_backend_4xx_threshold_critical}
EOF
type = "metric alert"
thresholds {
warning = "${var.elb_backend_4xx_threshold_warning}"
critical = "${var.elb_backend_4xx_threshold_critical}"
}
notify_no_data = true
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = false
new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20
tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"]
}
resource "datadog_monitor" "ELB_too_much_5xx_backend" {
name = "[${var.environment}] ELB backend 5xx errors too high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
message = "${var.message}"
query = <<EOF
avg(last_5m): (
avg:aws.elb.httpcode_backend_5xx{${data.template_file.filter.rendered}} by {region,loadbalancer} /
avg:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}
) * 100 > ${var.elb_backend_5xx_threshold_critical}"
EOF
type = "metric alert"
thresholds {
warning = "${var.elb_backend_5xx_threshold_warning}"
critical = "${var.elb_backend_5xx_threshold_critical}"
}
notify_no_data = true
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = false
new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20
@ -123,12 +161,12 @@ resource "datadog_monitor" "ELB_too_much_4xx_backend" {
}
resource "datadog_monitor" "ELB_backend_latency" {
name = "[${var.environment}] ELB latency to high on {{host.identifier}}"
name = "[${var.environment}] ELB latency too high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
message = "${var.message}"
query = <<EOF
avg(last_5m): (
avg:aws.elb.latency{${data.template_file.filter.rendered}} by {loadbalancername,region}
avg:aws.elb.latency{${data.template_file.filter.rendered}} by {region,loadbalancer}
) > ${var.elb_backend_latency_critical}}"
EOF
@ -139,14 +177,14 @@ resource "datadog_monitor" "ELB_backend_latency" {
critical = "${var.elb_backend_latency_critical}"
}
notify_no_data = "${var.elb_notify_no_data}"
notify_no_data = true
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = true
require_full_window = false
new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20