From 3cb93f967c49137bfb0f31414851b3264d1dfb9a Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 26 Jan 2018 19:07:22 +0100 Subject: [PATCH 01/12] MON-112 add alb monitors --- cloud/aws/alb/README.md | 53 +++++++++++ cloud/aws/alb/inputs.tf | 78 +++++++++++++++ cloud/aws/alb/monitors-alb.tf | 173 ++++++++++++++++++++++++++++++++++ 3 files changed, 304 insertions(+) create mode 100644 cloud/aws/alb/README.md create mode 100644 cloud/aws/alb/inputs.tf create mode 100644 cloud/aws/alb/monitors-alb.tf diff --git a/cloud/aws/alb/README.md b/cloud/aws/alb/README.md new file mode 100644 index 0000000..2a4dfed --- /dev/null +++ b/cloud/aws/alb/README.md @@ -0,0 +1,53 @@ +AWS ALB DataDog monitors +========================================== + +How to use this module +---------------------- + +``` +module "datadog-monitors-aws-alb" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/aws/alb?ref={revision}" + + environment = "${var.environment}" + message = "${module.datadog-message-alerting.alerting-message}" +} + +``` + +Purpose +------- +Creates DataDog monitors with the following checks : + +* ALB no healthy hosts +* ALB latency too high +* ALB http code 5xx percent too high +* ALB http code 4xx percent too high +* ALB target http code 5xx percent too high +* ALB target http code 4xx percent too high + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| delay | Delay in seconds for the metric evaluation | string | `900` | no | +| environment | Architecture environment | string | - | yes | +| filter_tags | Tags used for custom filtering | string | `*` | no | +| httpcode_elb_4xx_threshold_critical | loadbalancer 4xx critical threshold in percentage | string | `80` | no | +| httpcode_elb_4xx_threshold_warning | loadbalancer 4xx warning threshold in percentage | string | `60` | no | +| httpcode_elb_5xx_threshold_critical | loadbalancer 5xx critical threshold in percentage | 
string | `80` | no | +| httpcode_elb_5xx_threshold_warning | loadbalancer 5xx warning threshold in percentage | string | `60` | no | +| httpcode_target_4xx_threshold_critical | target 4xx critical threshold in percentage | string | `80` | no | +| httpcode_target_4xx_threshold_warning | target 4xx warning threshold in percentage | string | `60` | no | +| httpcode_target_5xx_threshold_critical | target 5xx critical threshold in percentage | string | `80` | no | +| httpcode_target_5xx_threshold_warning | target 5xx warning threshold in percentage | string | `60` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| latency_threshold_critical | latency critical threshold in milliseconds | string | `1000` | no | +| latency_threshold_warning | latency warning threshold in milliseconds | string | `500` | no | + +Related documentation +--------------------- + +DataDog blog: [https://www.datadoghq.com/blog/monitor-application-load-balancer/](https://www.datadoghq.com/blog/monitor-application-load-balancer/) + +AWS ALB metrics documentation: [https://docs.aws.amazon.com/elasticloadbalancing/latest/application/load-balancer-cloudwatch-metrics.html](https://docs.aws.amazon.com/elasticloadbalancing/latest/application/load-balancer-cloudwatch-metrics.html) diff --git a/cloud/aws/alb/inputs.tf b/cloud/aws/alb/inputs.tf new file mode 100644 index 0000000..ce7e1b1 --- /dev/null +++ b/cloud/aws/alb/inputs.tf @@ -0,0 +1,78 @@ +# Datadog global variables + +variable "environment" { + description = "Architecture environment" + type = "string" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "message" { + description = "Message sent when a monitor is triggered" +} + +variable "delay" { + description = "Delay in seconds for the metric 
evaluation" + default = 900 +} + +# Datadog monitors variables + +variable "latency_threshold_critical" { + default = 1000 + description = "latency critical threshold in milliseconds" +} + +variable "latency_threshold_warning" { + default = 500 + description = "latency warning threshold in milliseconds" +} + +variable "httpcode_elb_4xx_threshold_critical" { + default = 80 + description = "loadbalancer 4xx critical threshold in percentage" +} + +variable "httpcode_elb_4xx_threshold_warning" { + default = 60 + description = "loadbalancer 4xx warning threshold in percentage" +} + +variable "httpcode_target_4xx_threshold_critical" { + default = 80 + description = "target 4xx critical threshold in percentage" +} + +variable "httpcode_target_4xx_threshold_warning" { + default = 60 + description = "target 4xx warning threshold in percentage" +} + +variable "httpcode_elb_5xx_threshold_critical" { + default = 80 + description = "loadbalancer 5xxcritical threshold in percentage" +} + +variable "httpcode_elb_5xx_threshold_warning" { + default = 60 + description = "loadbalancer 5xx warning threshold in percentage" +} + +variable "httpcode_target_5xx_threshold_critical" { + default = 80 + description = "target 5xx critical threshold in percentage" +} + +variable "httpcode_target_5xx_threshold_warning" { + default = 60 + description = "target 5xx warning threshold in percentage" +} + diff --git a/cloud/aws/alb/monitors-alb.tf b/cloud/aws/alb/monitors-alb.tf new file mode 100644 index 0000000..86e9f3a --- /dev/null +++ b/cloud/aws/alb/monitors-alb.tf @@ -0,0 +1,173 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_aws_alb:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "ALB_no_healthy_instances" { + name = "[${var.environment}] ALB no healthy instances" + type = "metric alert" + message = "${var.message}" + query = < ${var.latency_threshold_critical} + EOF + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + thresholds { + critical = ${var.latency_threshold_critical} + warning = ${var.latency_threshold_warning} + } + + notify_no_data = true # Will notify when no data is received + renotify_interval = 0 + require_full_window = false + timeout_h = 0 + include_tags = true + + tags = ["env:${var.environment}", "resource:alb", "team:aws", "provider:aws"] +} + +resource "datadog_monitor" "ALB_httpcode_elb_5xx" { + name = "[${var.environment}] ALB HTTP code 5xx > ${var.httpcode_elb_5xx_threshold_critical} %" + type = "metric alert" + message = "${var.message}" + query = < ${var.httpcode_elb_5xx_threshold_critical} + EOF + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + thresholds { + critical = ${var.httpcode_elb_5xx_threshold_critical} + warning = ${var.httpcode_elb_5xx_threshold_warning} + } + + notify_no_data = false # Will notify when no data is received + renotify_interval = 0 + require_full_window = false + timeout_h = 1 + include_tags = true + + tags = ["env:${var.environment}", "resource:alb", "team:aws", "provider:aws"] +} + +resource "datadog_monitor" "ALB_httpcode_elb_4xx" { + name = "[${var.environment}] ALB HTTP code 4xx > ${var.httpcode_elb_4xx_threshold_critical} %" + type = "metric alert" + message = "${var.message}" + query = < ${var.httpcode_elb_4xx_threshold_critical} + EOF + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + thresholds { + critical = ${var.httpcode_elb_4xx_threshold_critical} + warning = ${var.httpcode_elb_4xx_threshold_warning} + } + + notify_no_data = false # Will notify when no data is 
received + renotify_interval = 0 + require_full_window = false + timeout_h = 1 + include_tags = true + + tags = ["env:${var.environment}", "resource:alb", "team:aws", "provider:aws"] +} + +resource "datadog_monitor" "ALB_httpcode_target_5xx" { + name = "[${var.environment}] ALB target HTTP code 5xx > ${var.httpcode_target_5xx_threshold_critical} %" + type = "metric alert" + message = "${var.message}" + query = < ${var.httpcode_target_5xx_threshold_critical} + EOF + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + thresholds { + critical = ${var.httpcode_target_5xx_threshold_critical} + warning = ${var.httpcode_target_5xx_threshold_warning} + } + + notify_no_data = false # Will notify when no data is received + renotify_interval = 0 + require_full_window = false + timeout_h = 1 + include_tags = true + + tags = ["env:${var.environment}", "resource:alb", "team:aws", "provider:aws"] +} + +resource "datadog_monitor" "ALB_httpcode_target_4xx" { + name = "[${var.environment}] ALB target HTTP code 4xx > ${var.httpcode_target_4xx_threshold_critical} %" + type = "metric alert" + message = "${var.message}" + query = < ${var.httpcode_target_4xx_threshold_critical} + EOF + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + thresholds { + critical = ${var.httpcode_target_4xx_threshold_critical} + warning = ${var.httpcode_target_4xx_threshold_warning} + } + + notify_no_data = false # Will notify when no data is received + renotify_interval = 0 + require_full_window = false + timeout_h = 1 + include_tags = true + + tags = ["env:${var.environment}", "resource:alb", "team:aws", "provider:aws"] +} + From 9229ad79a5f879a35593c20ad19f1fe7b6669e85 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 26 Jan 2018 19:12:22 +0100 Subject: [PATCH 02/12] MON-112 terraform fmt --- cloud/aws/alb/inputs.tf | 1 - cloud/aws/alb/monitors-alb.tf | 31 ++++++++++++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git 
a/cloud/aws/alb/inputs.tf b/cloud/aws/alb/inputs.tf index ce7e1b1..4320ce2 100644 --- a/cloud/aws/alb/inputs.tf +++ b/cloud/aws/alb/inputs.tf @@ -75,4 +75,3 @@ variable "httpcode_target_5xx_threshold_warning" { default = 60 description = "target 5xx warning threshold in percentage" } - diff --git a/cloud/aws/alb/monitors-alb.tf b/cloud/aws/alb/monitors-alb.tf index 86e9f3a..745f97c 100644 --- a/cloud/aws/alb/monitors-alb.tf +++ b/cloud/aws/alb/monitors-alb.tf @@ -10,6 +10,7 @@ resource "datadog_monitor" "ALB_no_healthy_instances" { name = "[${var.environment}] ALB no healthy instances" type = "metric alert" message = "${var.message}" + query = < Date: Fri, 26 Jan 2018 19:17:06 +0100 Subject: [PATCH 03/12] MON-112 change for generic filter variable --- cloud/aws/alb/monitors-alb.tf | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cloud/aws/alb/monitors-alb.tf b/cloud/aws/alb/monitors-alb.tf index 745f97c..571b429 100644 --- a/cloud/aws/alb/monitors-alb.tf +++ b/cloud/aws/alb/monitors-alb.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_aws_alb:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + filter = "${data.template_file.filter.rendered_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_aws_alb:enabled,env:%s", var.environment) : "${data.template_file.filter.rendered_custom}"}" } } @@ -13,7 +13,7 @@ resource "datadog_monitor" "ALB_no_healthy_instances" { query = < ${var.latency_threshold_critical} EOF @@ -68,8 +68,8 @@ resource "datadog_monitor" "ALB_httpcode_elb_5xx" { query = < ${var.httpcode_elb_5xx_threshold_critical} EOF @@ -97,8 +97,8 @@ resource "datadog_monitor" "ALB_httpcode_elb_4xx" { query = < ${var.httpcode_elb_4xx_threshold_critical} EOF @@ -126,8 +126,8 @@ resource "datadog_monitor" "ALB_httpcode_target_5xx" { query = < ${var.httpcode_target_5xx_threshold_critical} EOF @@ -155,8 +155,8 @@ resource "datadog_monitor" "ALB_httpcode_target_4xx" { query = < ${var.httpcode_target_4xx_threshold_critical} EOF From dc44c4523b2ae9fc066d3d854f7b40efaaa41788 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 26 Jan 2018 19:27:35 +0100 Subject: [PATCH 04/12] MON-112 update readme for filter variables --- cloud/aws/alb/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cloud/aws/alb/README.md b/cloud/aws/alb/README.md index 2a4dfed..e47d1ac 100644 --- a/cloud/aws/alb/README.md +++ b/cloud/aws/alb/README.md @@ -32,7 +32,8 @@ Inputs |------|-------------|:----:|:-----:|:-----:| | delay | Delay in seconds for the metric evaluation | string | `900` | no | | environment | Architecture environment | string | - | yes | -| filter_tags | Tags used for custom filtering | string | `*` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `*` | no | | httpcode_elb_4xx_threshold_critical | loadbalancer 4xx critical threshold in percentage | string | `80` | no | | httpcode_elb_4xx_threshold_warning | loadbalancer 4xx warning threshold in percentage | string | `60` | no | | httpcode_elb_5xx_threshold_critical | loadbalancer 5xx critical threshold in 
percentage | string | `80` | no | From 46bc47e802586e189e87557be8f61b7d637c49d1 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 26 Jan 2018 19:29:58 +0100 Subject: [PATCH 05/12] MON-112 update template file for filter --- cloud/aws/alb/monitors-alb.tf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cloud/aws/alb/monitors-alb.tf b/cloud/aws/alb/monitors-alb.tf index 571b429..623e935 100644 --- a/cloud/aws/alb/monitors-alb.tf +++ b/cloud/aws/alb/monitors-alb.tf @@ -2,7 +2,9 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${data.template_file.filter.rendered_use_defaults == "true" ? format("dd_monitoring:enabled,dd_aws_alb:enabled,env:%s", var.environment) : "${data.template_file.filter.rendered_custom}"}" + filter = "${var.filter_tags_use_defaults == "true" ? + format("dd_monitoring:enabled,dd_aws_alb:enabled,env:%s", var.environment) : + "${var.filter_tags_custom}"}" } } From 73e3017c2f7b86a65055a73790c528b154c60913 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Wed, 31 Jan 2018 11:12:45 +0100 Subject: [PATCH 06/12] MON-112 remove useless comments --- cloud/aws/alb/monitors-alb.tf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cloud/aws/alb/monitors-alb.tf b/cloud/aws/alb/monitors-alb.tf index 623e935..ed75653 100644 --- a/cloud/aws/alb/monitors-alb.tf +++ b/cloud/aws/alb/monitors-alb.tf @@ -26,7 +26,7 @@ resource "datadog_monitor" "ALB_no_healthy_instances" { critical = 0 } - notify_no_data = true # Will notify when no data is received + notify_no_data = true renotify_interval = 0 require_full_window = false timeout_h = 0 @@ -54,7 +54,7 @@ resource "datadog_monitor" "ALB_latency" { warning = "${var.latency_threshold_warning}" } - notify_no_data = true # Will notify when no data is received + notify_no_data = true renotify_interval = 0 require_full_window = false timeout_h = 0 @@ -83,7 +83,7 @@ resource "datadog_monitor" "ALB_httpcode_elb_5xx" { warning = 
"${var.httpcode_elb_5xx_threshold_warning}" } - notify_no_data = false # Will notify when no data is received + notify_no_data = false renotify_interval = 0 require_full_window = false timeout_h = 1 @@ -112,7 +112,7 @@ resource "datadog_monitor" "ALB_httpcode_elb_4xx" { warning = "${var.httpcode_elb_4xx_threshold_warning}" } - notify_no_data = false # Will notify when no data is received + notify_no_data = false renotify_interval = 0 require_full_window = false timeout_h = 1 @@ -141,7 +141,7 @@ resource "datadog_monitor" "ALB_httpcode_target_5xx" { warning = "${var.httpcode_target_5xx_threshold_warning}" } - notify_no_data = false # Will notify when no data is received + notify_no_data = false renotify_interval = 0 require_full_window = false timeout_h = 1 @@ -170,7 +170,7 @@ resource "datadog_monitor" "ALB_httpcode_target_4xx" { warning = "${var.httpcode_target_4xx_threshold_warning}" } - notify_no_data = false # Will notify when no data is received + notify_no_data = false renotify_interval = 0 require_full_window = false timeout_h = 1 From 98d6194f0c63d6fe807c8166028a137eaad48747 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Thu, 22 Feb 2018 17:38:46 +0100 Subject: [PATCH 07/12] MON-112 - Updated with best practice and fix notify_no_data --- cloud/aws/alb/monitors-alb.tf | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cloud/aws/alb/monitors-alb.tf b/cloud/aws/alb/monitors-alb.tf index ed75653..8bc7de3 100644 --- a/cloud/aws/alb/monitors-alb.tf +++ b/cloud/aws/alb/monitors-alb.tf @@ -36,12 +36,12 @@ resource "datadog_monitor" "ALB_no_healthy_instances" { } resource "datadog_monitor" "ALB_latency" { - name = "[${var.environment}] ALB latency > ${var.latency_threshold_critical} ms" + name = "[${var.environment}] ALB latency {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" type = "metric alert" message = "${var.message}" query = < 
${var.latency_threshold_critical} EOF @@ -54,7 +54,7 @@ resource "datadog_monitor" "ALB_latency" { warning = "${var.latency_threshold_warning}" } - notify_no_data = true + notify_no_data = false renotify_interval = 0 require_full_window = false timeout_h = 0 @@ -64,7 +64,7 @@ resource "datadog_monitor" "ALB_latency" { } resource "datadog_monitor" "ALB_httpcode_elb_5xx" { - name = "[${var.environment}] ALB HTTP code 5xx > ${var.httpcode_elb_5xx_threshold_critical} %" + name = "[${var.environment}] ALB HTTP code 5xx {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" type = "metric alert" message = "${var.message}" @@ -93,7 +93,7 @@ resource "datadog_monitor" "ALB_httpcode_elb_5xx" { } resource "datadog_monitor" "ALB_httpcode_elb_4xx" { - name = "[${var.environment}] ALB HTTP code 4xx > ${var.httpcode_elb_4xx_threshold_critical} %" + name = "[${var.environment}] ALB HTTP code 4xx {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" type = "metric alert" message = "${var.message}" @@ -122,7 +122,7 @@ resource "datadog_monitor" "ALB_httpcode_elb_4xx" { } resource "datadog_monitor" "ALB_httpcode_target_5xx" { - name = "[${var.environment}] ALB target HTTP code 5xx > ${var.httpcode_target_5xx_threshold_critical} %" + name = "[${var.environment}] ALB target HTTP code 5xx {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" type = "metric alert" message = "${var.message}" @@ -151,7 +151,7 @@ resource "datadog_monitor" "ALB_httpcode_target_5xx" { } resource "datadog_monitor" "ALB_httpcode_target_4xx" { - name = "[${var.environment}] ALB target HTTP code 4xx > ${var.httpcode_target_4xx_threshold_critical} %" + name = "[${var.environment}] ALB target HTTP code 4xx {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" 
type = "metric alert" message = "${var.message}" From ad6ad29d9ecb0952607ccebeb6f51e052058875f Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 20 Mar 2018 14:52:59 +0100 Subject: [PATCH 08/12] MON-96 - Update ALB monitors with inputs best practice --- cloud/aws/alb/README.md | 18 +++++++-- cloud/aws/alb/inputs.tf | 72 +++++++++++++++++++++++++++++++++++ cloud/aws/alb/monitors-alb.tf | 24 +++++++++--- 3 files changed, 105 insertions(+), 9 deletions(-) diff --git a/cloud/aws/alb/README.md b/cloud/aws/alb/README.md index e47d1ac..a6dcac0 100644 --- a/cloud/aws/alb/README.md +++ b/cloud/aws/alb/README.md @@ -30,21 +30,33 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| +| alb_no_healthy_instances_message | Custom message for ALB no healthy instances monitor | string | `` | no | +| alb_no_healthy_instances_silenced | Groups to mute for ALB no healthy instances monitor | map | `` | no | | delay | Delay in seconds for the metric evaluation | string | `900` | no | | environment | Architecture environment | string | - | yes | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | -| filter_tags_use_defaults | Use default filter tags convention | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| httpcode_elb_4xx_message | Custom message for ALB httpcode 4xx monitor | string | `` | no | +| httpcode_elb_4xx_silenced | Groups to mute for ALB httpcode 4xx monitor | map | `` | no | | httpcode_elb_4xx_threshold_critical | loadbalancer 4xx critical threshold in percentage | string | `80` | no | | httpcode_elb_4xx_threshold_warning | loadbalancer 4xx warning threshold in percentage | string | `60` | no | -| httpcode_elb_5xx_threshold_critical | loadbalancer 5xx critical threshold in percentage | string | `80` | no | +| httpcode_elb_5xx_message | Custom message for ALB httpcode 5xx monitor | 
string | `` | no | +| httpcode_elb_5xx_silenced | Groups to mute for ALB httpcode 5xx monitor | map | `` | no | +| httpcode_elb_5xx_threshold_critical | loadbalancer 5xxcritical threshold in percentage | string | `80` | no | | httpcode_elb_5xx_threshold_warning | loadbalancer 5xx warning threshold in percentage | string | `60` | no | +| httpcode_target_4xx_message | Custom message for ALB target httpcode 4xx monitor | string | `` | no | +| httpcode_target_4xx_silenced | Groups to mute for ALB target httpcode 4xx monitor | map | `` | no | | httpcode_target_4xx_threshold_critical | target 4xx critical threshold in percentage | string | `80` | no | | httpcode_target_4xx_threshold_warning | target 4xx warning threshold in percentage | string | `60` | no | +| httpcode_target_5xx_message | Custom message for ALB target httpcode 5xx monitor | string | `` | no | +| httpcode_target_5xx_silenced | Groups to mute for ALB target httpcode 5xx monitor | map | `` | no | | httpcode_target_5xx_threshold_critical | target 5xx critical threshold in percentage | string | `80` | no | | httpcode_target_5xx_threshold_warning | target 5xx warning threshold in percentage | string | `60` | no | -| message | Message sent when a monitor is triggered | string | - | yes | +| latency_message | Custom message for ALB latency monitor | string | `` | no | +| latency_silenced | Groups to mute for ALB latency monitor | map | `` | no | | latency_threshold_critical | latency critical threshold in milliseconds | string | `1000` | no | | latency_threshold_warning | latency warning threshold in milliseconds | string | `500` | no | +| message | Message sent when a monitor is triggered | string | - | yes | Related documentation --------------------- diff --git a/cloud/aws/alb/inputs.tf b/cloud/aws/alb/inputs.tf index 4320ce2..32a7278 100644 --- a/cloud/aws/alb/inputs.tf +++ b/cloud/aws/alb/inputs.tf @@ -26,6 +26,30 @@ variable "delay" { # Datadog monitors variables +variable 
"alb_no_healthy_instances_silenced" { + description = "Groups to mute for ALB no healthy instances monitor" + type = "map" + default = {} +} + +variable "alb_no_healthy_instances_message" { + description = "Custom message for ALB no healthy instances monitor" + type = "string" + default = "" +} + +variable "latency_silenced" { + description = "Groups to mute for ALB latency monitor" + type = "map" + default = {} +} + +variable "latency_message" { + description = "Custom message for ALB latency monitor" + type = "string" + default = "" +} + variable "latency_threshold_critical" { default = 1000 description = "latency critical threshold in milliseconds" @@ -36,6 +60,18 @@ variable "latency_threshold_warning" { description = "latency warning threshold in milliseconds" } +variable "httpcode_elb_4xx_silenced" { + description = "Groups to mute for ALB httpcode 4xx monitor" + type = "map" + default = {} +} + +variable "httpcode_elb_4xx_message" { + description = "Custom message for ALB httpcode 4xx monitor" + type = "string" + default = "" +} + variable "httpcode_elb_4xx_threshold_critical" { default = 80 description = "loadbalancer 4xx critical threshold in percentage" @@ -46,6 +82,18 @@ variable "httpcode_elb_4xx_threshold_warning" { description = "loadbalancer 4xx warning threshold in percentage" } +variable "httpcode_target_4xx_silenced" { + description = "Groups to mute for ALB target httpcode 4xx monitor" + type = "map" + default = {} +} + +variable "httpcode_target_4xx_message" { + description = "Custom message for ALB target httpcode 4xx monitor" + type = "string" + default = "" +} + variable "httpcode_target_4xx_threshold_critical" { default = 80 description = "target 4xx critical threshold in percentage" @@ -56,6 +104,18 @@ variable "httpcode_target_4xx_threshold_warning" { description = "target 4xx warning threshold in percentage" } +variable "httpcode_elb_5xx_silenced" { + description = "Groups to mute for ALB httpcode 5xx monitor" + type = "map" + default = 
{} +} + +variable "httpcode_elb_5xx_message" { + description = "Custom message for ALB httpcode 5xx monitor" + type = "string" + default = "" +} + variable "httpcode_elb_5xx_threshold_critical" { default = 80 description = "loadbalancer 5xxcritical threshold in percentage" @@ -66,6 +126,18 @@ variable "httpcode_elb_5xx_threshold_warning" { description = "loadbalancer 5xx warning threshold in percentage" } +variable "httpcode_target_5xx_silenced" { + description = "Groups to mute for ALB target httpcode 5xx monitor" + type = "map" + default = {} +} + +variable "httpcode_target_5xx_message" { + description = "Custom message for ALB target httpcode 5xx monitor" + type = "string" + default = "" +} + variable "httpcode_target_5xx_threshold_critical" { default = 80 description = "target 5xx critical threshold in percentage" diff --git a/cloud/aws/alb/monitors-alb.tf b/cloud/aws/alb/monitors-alb.tf index 8bc7de3..79479b8 100644 --- a/cloud/aws/alb/monitors-alb.tf +++ b/cloud/aws/alb/monitors-alb.tf @@ -11,7 +11,7 @@ data "template_file" "filter" { resource "datadog_monitor" "ALB_no_healthy_instances" { name = "[${var.environment}] ALB no healthy instances" type = "metric alert" - message = "${var.message}" + message = "${coalesce(var.alb_no_healthy_instances_message, var.message)}" query = < Date: Tue, 20 Mar 2018 17:47:41 +0100 Subject: [PATCH 09/12] MON-139 - Updated ALB monitors name with new best practice --- cloud/aws/alb/monitors-alb.tf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cloud/aws/alb/monitors-alb.tf b/cloud/aws/alb/monitors-alb.tf index 8bc7de3..89c3fd9 100644 --- a/cloud/aws/alb/monitors-alb.tf +++ b/cloud/aws/alb/monitors-alb.tf @@ -36,7 +36,7 @@ resource "datadog_monitor" "ALB_no_healthy_instances" { } resource "datadog_monitor" "ALB_latency" { - name = "[${var.environment}] ALB latency {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + name = 
"[${var.environment}] ALB latency {{#is_alert}}{{comparator}}{{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}}{{warn_threshold}}% ({{value}}%){{/is_warning}}" type = "metric alert" message = "${var.message}" @@ -64,7 +64,7 @@ resource "datadog_monitor" "ALB_latency" { } resource "datadog_monitor" "ALB_httpcode_elb_5xx" { - name = "[${var.environment}] ALB HTTP code 5xx {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + name = "[${var.environment}] ALB HTTP code 5xx {{#is_alert}}{{comparator}}{{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}}{{warn_threshold}}% ({{value}}%){{/is_warning}}" type = "metric alert" message = "${var.message}" @@ -93,7 +93,7 @@ resource "datadog_monitor" "ALB_httpcode_elb_5xx" { } resource "datadog_monitor" "ALB_httpcode_elb_4xx" { - name = "[${var.environment}] ALB HTTP code 4xx {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + name = "[${var.environment}] ALB HTTP code 4xx {{#is_alert}}{{comparator}}{{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}}{{warn_threshold}}% ({{value}}%){{/is_warning}}" type = "metric alert" message = "${var.message}" @@ -122,7 +122,7 @@ resource "datadog_monitor" "ALB_httpcode_elb_4xx" { } resource "datadog_monitor" "ALB_httpcode_target_5xx" { - name = "[${var.environment}] ALB target HTTP code 5xx {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + name = "[${var.environment}] ALB target HTTP code 5xx {{#is_alert}}{{comparator}}{{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}}{{warn_threshold}}% ({{value}}%){{/is_warning}}" type = "metric alert" message = "${var.message}" @@ -151,7 +151,7 @@ resource "datadog_monitor" "ALB_httpcode_target_5xx" { } resource "datadog_monitor" "ALB_httpcode_target_4xx" { - name = 
"[${var.environment}] ALB target HTTP code 4xx {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + name = "[${var.environment}] ALB target HTTP code 4xx {{#is_alert}}{{comparator}}{{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}}{{warn_threshold}}% ({{value}}%){{/is_warning}}" type = "metric alert" message = "${var.message}" From 7ea41564ba3a6f5dc4c0d03fd8fa56ed72facc21 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Wed, 21 Mar 2018 12:13:11 +0100 Subject: [PATCH 10/12] MON-140 - Added default function on ALB monitors --- cloud/aws/alb/monitors-alb.tf | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/cloud/aws/alb/monitors-alb.tf b/cloud/aws/alb/monitors-alb.tf index 79479b8..87a466a 100644 --- a/cloud/aws/alb/monitors-alb.tf +++ b/cloud/aws/alb/monitors-alb.tf @@ -74,8 +74,10 @@ resource "datadog_monitor" "ALB_httpcode_elb_5xx" { query = < ${var.httpcode_elb_5xx_threshold_critical} EOF @@ -90,7 +92,7 @@ resource "datadog_monitor" "ALB_httpcode_elb_5xx" { notify_no_data = false renotify_interval = 0 require_full_window = false - timeout_h = 1 + timeout_h = 0 include_tags = true silenced = "${var.httpcode_elb_5xx_silenced}" @@ -105,8 +107,10 @@ resource "datadog_monitor" "ALB_httpcode_elb_4xx" { query = < ${var.httpcode_elb_4xx_threshold_critical} EOF @@ -121,7 +125,7 @@ resource "datadog_monitor" "ALB_httpcode_elb_4xx" { notify_no_data = false renotify_interval = 0 require_full_window = false - timeout_h = 1 + timeout_h = 0 include_tags = true silenced = "${var.httpcode_elb_4xx_silenced}" @@ -136,8 +140,10 @@ resource "datadog_monitor" "ALB_httpcode_target_5xx" { query = < ${var.httpcode_target_5xx_threshold_critical} EOF @@ -152,7 +158,7 @@ resource "datadog_monitor" "ALB_httpcode_target_5xx" { notify_no_data = false renotify_interval = 0 require_full_window = false - timeout_h = 1 + timeout_h = 0 include_tags = true 
silenced = "${var.httpcode_target_5xx_silenced}" @@ -167,8 +173,10 @@ resource "datadog_monitor" "ALB_httpcode_target_4xx" { query = < ${var.httpcode_target_4xx_threshold_critical} EOF @@ -183,7 +191,7 @@ resource "datadog_monitor" "ALB_httpcode_target_4xx" { notify_no_data = false renotify_interval = 0 require_full_window = false - timeout_h = 1 + timeout_h = 0 include_tags = true silenced = "${var.httpcode_target_4xx_silenced}" From ca8ee7a9550503953ea34e23f805466ab5b225b1 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Thu, 22 Mar 2018 20:06:08 +0100 Subject: [PATCH 11/12] MON-139 fix missing space in templating name --- cloud/aws/alb/monitors-alb.tf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cloud/aws/alb/monitors-alb.tf b/cloud/aws/alb/monitors-alb.tf index 89c3fd9..4aa0f8b 100644 --- a/cloud/aws/alb/monitors-alb.tf +++ b/cloud/aws/alb/monitors-alb.tf @@ -36,7 +36,7 @@ resource "datadog_monitor" "ALB_no_healthy_instances" { } resource "datadog_monitor" "ALB_latency" { - name = "[${var.environment}] ALB latency {{#is_alert}}{{comparator}}{{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}}{{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "[${var.environment}] ALB latency {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" type = "metric alert" message = "${var.message}" @@ -64,7 +64,7 @@ resource "datadog_monitor" "ALB_latency" { } resource "datadog_monitor" "ALB_httpcode_elb_5xx" { - name = "[${var.environment}] ALB HTTP code 5xx {{#is_alert}}{{comparator}}{{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}}{{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "[${var.environment}] ALB HTTP code 5xx {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" type = "metric alert" message = 
"${var.message}" @@ -93,7 +93,7 @@ resource "datadog_monitor" "ALB_httpcode_elb_5xx" { } resource "datadog_monitor" "ALB_httpcode_elb_4xx" { - name = "[${var.environment}] ALB HTTP code 4xx {{#is_alert}}{{comparator}}{{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}}{{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "[${var.environment}] ALB HTTP code 4xx {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" type = "metric alert" message = "${var.message}" @@ -122,7 +122,7 @@ resource "datadog_monitor" "ALB_httpcode_elb_4xx" { } resource "datadog_monitor" "ALB_httpcode_target_5xx" { - name = "[${var.environment}] ALB target HTTP code 5xx {{#is_alert}}{{comparator}}{{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}}{{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "[${var.environment}] ALB target HTTP code 5xx {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" type = "metric alert" message = "${var.message}" @@ -151,7 +151,7 @@ resource "datadog_monitor" "ALB_httpcode_target_5xx" { } resource "datadog_monitor" "ALB_httpcode_target_4xx" { - name = "[${var.environment}] ALB target HTTP code 4xx {{#is_alert}}{{comparator}}{{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}}{{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "[${var.environment}] ALB target HTTP code 4xx {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" type = "metric alert" message = "${var.message}" From cc0e970dda93ef4d67823730bcc111be9b485b3d Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Thu, 22 Mar 2018 21:07:44 +0100 Subject: [PATCH 12/12] MON-112 add artificial request for false positive mitigation --- cloud/aws/alb/inputs.tf | 5 +++++ 
cloud/aws/alb/monitors-alb.tf | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cloud/aws/alb/inputs.tf b/cloud/aws/alb/inputs.tf index 32a7278..e6d71dd 100644 --- a/cloud/aws/alb/inputs.tf +++ b/cloud/aws/alb/inputs.tf @@ -147,3 +147,8 @@ variable "httpcode_target_5xx_threshold_warning" { default = 60 description = "target 5xx warning threshold in percentage" } + +variable "artificial_requests_count" { + default = 5 + description = "Number of false requests used to mitigate false positive in case of low traffic" +} diff --git a/cloud/aws/alb/monitors-alb.tf b/cloud/aws/alb/monitors-alb.tf index 6e7d823..e03c8d2 100644 --- a/cloud/aws/alb/monitors-alb.tf +++ b/cloud/aws/alb/monitors-alb.tf @@ -76,7 +76,7 @@ resource "datadog_monitor" "ALB_httpcode_elb_5xx" { sum(last_5m): ( default( avg:aws.applicationelb.httpcode_elb_5xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() / - avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count(), + (avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}), 0) * 100 ) > ${var.httpcode_elb_5xx_threshold_critical} EOF @@ -109,7 +109,7 @@ resource "datadog_monitor" "ALB_httpcode_elb_4xx" { sum(last_5m): ( default( avg:aws.applicationelb.httpcode_elb_4xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() / - avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count(), + (avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}), 0) * 100 ) > ${var.httpcode_elb_4xx_threshold_critical} EOF @@ -142,7 +142,7 @@ resource "datadog_monitor" "ALB_httpcode_target_5xx" { sum(last_5m): ( default( avg:aws.applicationelb.httpcode_target_5xx{${data.template_file.filter.rendered}} by
{region,loadbalancer}.as_count() / - avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count(), + (avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}), 0) * 100 ) > ${var.httpcode_target_5xx_threshold_critical} EOF @@ -175,7 +175,7 @@ resource "datadog_monitor" "ALB_httpcode_target_4xx" { sum(last_5m): ( default( avg:aws.applicationelb.httpcode_target_4xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() / - avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count(), + (avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}), 0) * 100 ) > ${var.httpcode_target_4xx_threshold_critical} EOF