From 115b8e0bf3c9556acf690c5a0ee5600caa74d95c Mon Sep 17 00:00:00 2001 From: "gauthier.ampe@fr.clara.net" Date: Fri, 20 Sep 2019 16:44:12 +0200 Subject: [PATCH 1/9] MON-366 Add app-gateway monitors --- README.md | 1 + cloud/azure/app-gateway/README.md | 87 ++++++ cloud/azure/app-gateway/inputs.tf | 255 ++++++++++++++++++ cloud/azure/app-gateway/modules.tf | 31 +++ .../app-gateway/monitors-app_services.tf | 190 +++++++++++++ cloud/azure/app-gateway/outputs.tf | 30 +++ 6 files changed, 594 insertions(+) create mode 100644 cloud/azure/app-gateway/README.md create mode 100644 cloud/azure/app-gateway/inputs.tf create mode 100644 cloud/azure/app-gateway/modules.tf create mode 100644 cloud/azure/app-gateway/monitors-app_services.tf create mode 100644 cloud/azure/app-gateway/outputs.tf diff --git a/README.md b/README.md index 964c4a2..e7bfce6 100644 --- a/README.md +++ b/README.md @@ -161,6 +161,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi - [vpn](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/vpn/) - [azure](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/azure/) - [apimanagement](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/azure/apimanagement/) + - [app-gateway](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/azure/app-gateway/) - [app-services](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/azure/app-services/) - [azure-search](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/azure/azure-search/) - [cosmosdb](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/azure/cosmosdb/) diff --git a/cloud/azure/app-gateway/README.md b/cloud/azure/app-gateway/README.md new file mode 100644 index 0000000..2333b2d --- /dev/null +++ b/cloud/azure/app-gateway/README.md @@ -0,0 +1,87 @@ +# CLOUD AZURE APP-GATEWAY DataDog monitors + +## How to use this module + +``` +module "datadog-monitors-cloud-azure-app-gateway" { + source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/azure/app-gateway?ref={revision}" + + environment = var.environment + message = module.datadog-message-alerting.alerting-message +} + +``` + +## Purpose + +Creates DataDog monitors with the following checks: + +- App Gateway failed requests (disabled by default) +- App Gateway HTTP 4xx errors too high +- App Gateway HTTP 5xx errors too high +- App Gateway is down +- App Gateway no connection (disabled by default) +- App Gateway no healthy host + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| appgateway\_failed\_requests\_enabled | Flag to enable App Gateway failed requests monitor | string | `"false"` | no | +| appgateway\_failed\_requests\_extra\_tags | Extra tags for App Gateway failed requests monitor | list(string) | `[]` | no | +| appgateway\_failed\_requests\_message | Custom message for App Gateway failed requests monitor | string | `""` | no | +| appgateway\_failed\_requests\_threshold\_critical | Maximum critical acceptable percent of failed errors | string | `"100"` | no | +| appgateway\_failed\_requests\_threshold\_warning | Warning regarding acceptable percent of failed errors | string | `"200"` | no | +| appgateway\_failed\_requests\_time\_aggregator | Monitor aggregator for App Gateway failed requests [available values: min, max or avg] | string | `"min"` | no | +| appgateway\_failed\_requests\_timeframe | Monitor timeframe for App Gateway failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| appgateway\_healthy\_host\_count\_enabled | Flag to enable App Gateway healthy host monitor | string | `"true"` | no | +| appgateway\_healthy\_host\_count\_extra\_tags | Extra tags for App Gateway healthy host monitor | list(string) | `[]` | no | +| appgateway\_healthy\_host\_count\_message | Custom message for App Gateway healthy host monitor | string | `""` | no | +| appgateway\_healthy\_host\_count\_time\_aggregator | Monitor aggregator for App Gateway healthy host [available values: min, max or avg] | string | `"min"` | no | +| appgateway\_healthy\_host\_count\_timeframe | Monitor timeframe for App Gateway healthy host [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| appgateway\_http\_4xx\_errors\_enabled | Flag to enable App Gateway http 4xx errors monitor | string | `"true"` | no | +| appgateway\_http\_4xx\_errors\_extra\_tags | Extra tags for App Gateway http 4xx errors monitor | list(string) | `[]` | no | +| appgateway\_http\_4xx\_errors\_message | Custom message for App Gateway http 4xx errors monitor | string | `""` | no | +| appgateway\_http\_4xx\_errors\_threshold\_critical | Minimum critical acceptable percent of 4xx error | string | `"30"` | no | +| appgateway\_http\_4xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 4xx error | string | `"20"` | no | +| appgateway\_http\_4xx\_errors\_time\_aggregator | Monitor aggregator for App Gateway http 4xx errors [available values: min, max or avg] | string | `"max"` | no | +| appgateway\_http\_4xx\_errors\_timeframe | Monitor timeframe for App Gateway http 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| appgateway\_http\_5xx\_errors\_enabled | Flag to enable App Gateway http 5xx errors monitor | string | `"true"` | no | +| appgateway\_http\_5xx\_errors\_extra\_tags | Extra tags for App Gateway http 5xx errors monitor | list(string) | `[]` | no | +| appgateway\_http\_5xx\_errors\_message | Custom message for App Gateway http 5xx errors monitor | string | `""` | no | +| appgateway\_http\_5xx\_errors\_threshold\_critical | Minimum critical acceptable percent of 5xx error | string | `"30"` | no | +| appgateway\_http\_5xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 5xx error | string | `"20"` | no | +| appgateway\_http\_5xx\_errors\_time\_aggregator | Monitor aggregator for App Gateway http 5xx errors [available values: min, max or avg] | string | `"max"` | no | +| appgateway\_http\_5xx\_errors\_timeframe | Monitor timeframe for App Gateway http 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| current\_connection\_enabled | Flag to enable App Gateway current connections monitor | string | `"false"` | no | +| current\_connection\_extra\_tags | Extra tags for App Gateway current connections monitor | list(string) | `[]` | no | +| current\_connection\_message | Custom message for App Gateway current connections monitor | string | `""` | no | +| current\_connection\_time\_aggregator | Monitor aggregator for App Gateway current connections [available values: min, max or avg] | string | `"min"` | no | +| current\_connection\_timeframe | Monitor timeframe for App Gateway current connections [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| environment | Architecture environment | string | n/a | yes | +| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no | +| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no | +| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | +| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | +| message | Message sent when a monitor is triggered | string | n/a | yes | +| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | +| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no | +| status\_enabled | Flag to enable App Gateway status | string | `"true"` | no | +| status\_extra\_tags | Extra tags for App Gateway status | list(string) | `[]` | no | +| status\_message | Custom message for App Gateway status | string | `""` | no | +| status\_time\_aggregator | Monitor aggregator for App Gateway status [available values: min, max or avg] | string | `"min"` | no | +| status\_timeframe | Monitor timeframe for App Gateway status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| appgateway\_failed\_requests\_id | id for monitor appgateway_failed_requests | +| appgateway\_healthy\_host\_count\_id | id for monitor appgateway_healthy_host_count | +| appgateway\_http\_4xx\_errors\_id | id for monitor appgateway_http_4xx_errors | +| appgateway\_http\_5xx\_errors\_id | id for monitor appgateway_http_5xx_errors | +| appgateway\_status\_id | id for monitor appgateway_status | +| current\_connection\_id | id for monitor current_connection | + +## Related documentation + diff --git a/cloud/azure/app-gateway/inputs.tf b/cloud/azure/app-gateway/inputs.tf new file mode 100644 index 0000000..8cdb5a4 --- /dev/null +++ b/cloud/azure/app-gateway/inputs.tf @@ -0,0 +1,255 @@ +variable "environment" { + description = "Architecture environment" + type = string +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "filter_tags_custom_excluded" { + description = "Tags excluded for custom filtering when filter_tags_use_defaults is false" + default = "" +} + +variable "message" { + description = "Message sent when a monitor is triggered" +} + +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 900 +} + +variable "new_host_delay" { + description = "Delay in seconds before monitor new resource" + default = 300 +} + +variable "prefix_slug" { + description = "Prefix string to prepend between brackets on every monitors names" + default = "" +} + +# Azure App Gateway specific variables +# Monitoring App Gateway status +variable "status_enabled" { + description = "Flag to enable App Gateway status" + type = string + default = "true" +} + +variable "status_extra_tags" { + description = "Extra tags for App Gateway status" + type = list(string) + default = [] +} + +variable "status_message" { + description = "Custom message for App Gateway status" + type = string + default = "" +} + +variable "status_time_aggregator" { + description = "Monitor aggregator for App Gateway status [available values: min, max or avg]" + type = string + default = "min" +} + +variable "status_timeframe" { + description = "Monitor timeframe for App Gateway status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +# Monitoring App Gateway current_connections (count) +variable "current_connection_enabled" { + description = "Flag to enable App Gateway current connections monitor" + type = string + default = "false" +} + +variable "current_connection_extra_tags" { + description = "Extra tags for App Gateway current connections monitor" + type = list(string) + default = [] +} + +variable "current_connection_message" { + description = "Custom message for App Gateway current connections monitor" + type = string + default = "" +} + +variable "current_connection_time_aggregator" { + description = "Monitor aggregator for App Gateway current connections [available values: min, max or avg]" + type = string + default = "min" +} + +variable "current_connection_timeframe" { + description = "Monitor timeframe for App Gateway current connections [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +# Monitoring App Gateway failed_requests (count) +variable "appgateway_failed_requests_enabled" { + description = "Flag to enable App Gateway failed requests monitor" + type = string + default = "false" +} + +variable "appgateway_failed_requests_extra_tags" { + description = "Extra tags for App Gateway failed requests monitor" + type = list(string) + default = [] +} + +variable "appgateway_failed_requests_message" { + description = "Custom message for App Gateway failed requests monitor" + type = string + default = "" +} + +variable "appgateway_failed_requests_time_aggregator" { + description = "Monitor aggregator for App Gateway failed requests [available values: min, max or avg]" + type = string + default = "min" +} + +variable "appgateway_failed_requests_timeframe" { + description = "Monitor timeframe for App Gateway failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +variable "appgateway_failed_requests_threshold_critical" { + default = 100 + description = "Maximum critical acceptable percent of failed errors" +} + +variable "appgateway_failed_requests_threshold_warning" { + default = 200 + description = "Warning regarding acceptable percent of failed errors" +} + +# Monitoring App Gateway healthy_host_count (count) +variable "appgateway_healthy_host_count_enabled" { + description = "Flag to enable App Gateway healthy host monitor" + type = string + default = "true" +} + +variable "appgateway_healthy_host_count_extra_tags" { + description = "Extra tags for App Gateway healthy host monitor" + type = list(string) + default = [] +} + +variable "appgateway_healthy_host_count_message" { + description = "Custom message for App Gateway healthy host monitor" + type = string + default = "" +} + +variable "appgateway_healthy_host_count_time_aggregator" { + description = "Monitor aggregator for App Gateway healthy host [available values: min, max or avg]" + type = string + default = "min" +} + +variable "appgateway_healthy_host_count_timeframe" { + description = "Monitor timeframe for App Gateway healthy host [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +# Monitoring App Gateway response_status 4xx (count) +variable "appgateway_http_4xx_errors_enabled" { + description = "Flag to enable App Gateway http 4xx errors monitor" + type = string + default = "true" +} + +variable "appgateway_http_4xx_errors_extra_tags" { + description = "Extra tags for App Gateway http 4xx errors monitor" + type = list(string) + default = [] +} + +variable "appgateway_http_4xx_errors_message" { + description = "Custom message for App Gateway http 4xx errors monitor" + type = string + default = "" +} + +variable "appgateway_http_4xx_errors_time_aggregator" { + description = "Monitor aggregator for App Gateway http 4xx errors [available values: min, max or avg]" + type = string + default = "max" +} + +variable "appgateway_http_4xx_errors_timeframe" { + description = "Monitor timeframe for App Gateway http 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +variable "appgateway_http_4xx_errors_threshold_critical" { + default = 30 + description = "Minimum critical acceptable percent of 4xx error" +} + +variable "appgateway_http_4xx_errors_threshold_warning" { + default = 20 + description = "Warning regarding acceptable percent of 4xx error" +} + +# Monitoring App Gateway response_status 5xx (count) +variable "appgateway_http_5xx_errors_enabled" { + description = "Flag to enable App Gateway http 5xx errors monitor" + type = string + default = "true" +} + +variable "appgateway_http_5xx_errors_extra_tags" { + description = "Extra tags for App Gateway http 5xx errors monitor" + type = list(string) + default = [] +} + +variable "appgateway_http_5xx_errors_message" { + description = "Custom message for App Gateway http 5xx errors monitor" + type = string + default = "" +} + +variable "appgateway_http_5xx_errors_time_aggregator" { + description = "Monitor aggregator for App Gateway http 5xx errors [available values: min, max or avg]" + type = string + default = "max" +} + +variable "appgateway_http_5xx_errors_timeframe" { + description = "Monitor timeframe for App Gateway http 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +variable "appgateway_http_5xx_errors_threshold_critical" { + default = 30 + description = "Minimum critical acceptable percent of 5xx error" +} + +variable "appgateway_http_5xx_errors_threshold_warning" { + default = 20 + description = "Warning regarding acceptable percent of 5xx error" +} diff --git a/cloud/azure/app-gateway/modules.tf b/cloud/azure/app-gateway/modules.tf new file mode 100644 index 0000000..f2595dc --- /dev/null +++ b/cloud/azure/app-gateway/modules.tf @@ -0,0 +1,31 @@ +module "filter-tags" { + source = "../../../common/filter-tags" + + environment = var.environment + resource = "azure_app-gateway" + filter_tags_use_defaults = var.filter_tags_use_defaults + filter_tags_custom = var.filter_tags_custom + filter_tags_custom_excluded = var.filter_tags_custom_excluded +} + +module "filter-tags-4xx-error" { + source = "../../../common/filter-tags" + + environment = var.environment + resource = "azure_app-gateway" + filter_tags_use_defaults = var.filter_tags_use_defaults + filter_tags_custom = var.filter_tags_custom + filter_tags_custom_excluded = var.filter_tags_custom_excluded + extra_tags = ["httpstatus:5xx"] +} + +module "filter-tags-5xx-error" { + source = "../../../common/filter-tags" + + environment = var.environment + resource = "azure_app-gateway" + filter_tags_use_defaults = var.filter_tags_use_defaults + filter_tags_custom = var.filter_tags_custom + filter_tags_custom_excluded = var.filter_tags_custom_excluded + extra_tags = ["httpstatus:5xx"] +} diff --git a/cloud/azure/app-gateway/monitors-app_services.tf b/cloud/azure/app-gateway/monitors-app_services.tf new file mode 100644 index 0000000..bdfb76b --- /dev/null +++ b/cloud/azure/app-gateway/monitors-app_services.tf @@ -0,0 +1,190 @@ +# Monitoring App Gateway status +resource "datadog_monitor" "appgateway_status" { + count = var.status_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway is down" + message = coalesce(var.status_message, var.message) + type = "query alert" + + query = < ${var.appgateway_failed_requests_threshold_critical} +EOQ + + thresholds = { + critical = var.appgateway_failed_requests_threshold_critical + warning = var.appgateway_failed_requests_threshold_warning + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + notify_no_data = false + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = false + + tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-gateway", "team:claranet", "created-by:terraform"], var.appgateway_failed_requests_extra_tags) + + lifecycle { + ignore_changes = ["silenced"] + } +} + +# Monitoring App Gateway healthy_host_count (count) +resource "datadog_monitor" "appgateway_healthy_host_count" { + count = var.appgateway_healthy_host_count_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway no healthy host {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = coalesce(var.appgateway_healthy_host_count_message, var.message) + type = "query alert" + + query = < ${var.appgateway_http_4xx_errors_threshold_critical} +EOQ + + + thresholds = { + warning = var.appgateway_http_4xx_errors_threshold_warning + critical = var.appgateway_http_4xx_errors_threshold_critical + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + notify_no_data = false + renotify_interval = 0 + require_full_window = false + timeout_h = 1 + include_tags = true + + tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-gateway", "team:claranet", "created-by:terraform"], var.appgateway_http_4xx_errors_extra_tags) + + lifecycle { + ignore_changes = ["silenced"] + } +} + +# Monitoring App Gateway response_status 5xx (count) +resource "datadog_monitor" "appgateway_http_5xx_errors" { + count = var.appgateway_http_5xx_errors_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway HTTP 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = coalesce(var.appgateway_http_5xx_errors_message, var.message) + type = "query alert" + + query = < ${var.appgateway_http_5xx_errors_threshold_critical} +EOQ + + thresholds = { + warning = var.appgateway_http_5xx_errors_threshold_warning + critical = var.appgateway_http_5xx_errors_threshold_critical + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + notify_no_data = false + renotify_interval = 0 + require_full_window = false + timeout_h = 1 + include_tags = true + + tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-gateway", "team:claranet", "created-by:terraform"], var.appgateway_http_5xx_errors_extra_tags) + + lifecycle { + ignore_changes = ["silenced"] + } +} diff --git a/cloud/azure/app-gateway/outputs.tf b/cloud/azure/app-gateway/outputs.tf new file mode 100644 index 0000000..a50ee60 --- /dev/null +++ b/cloud/azure/app-gateway/outputs.tf @@ -0,0 +1,30 @@ +output "appgateway_failed_requests_id" { + description = "id for monitor appgateway_failed_requests" + value = datadog_monitor.appgateway_failed_requests.*.id +} + +output "appgateway_healthy_host_count_id" { + description = "id for monitor appgateway_healthy_host_count" + value = datadog_monitor.appgateway_healthy_host_count.*.id +} + +output "appgateway_http_4xx_errors_id" { + description = "id for monitor appgateway_http_4xx_errors" + value = datadog_monitor.appgateway_http_4xx_errors.*.id +} + +output "appgateway_http_5xx_errors_id" { + description = "id for monitor appgateway_http_5xx_errors" + value = datadog_monitor.appgateway_http_5xx_errors.*.id +} + +output "appgateway_status_id" { + description = "id for monitor appgateway_status" + value = datadog_monitor.appgateway_status.*.id +} + +output "current_connection_id" { + description = "id for monitor current_connection" + value = datadog_monitor.current_connection.*.id +} + From ce6750feda3f075c6a2e15e1397690392618b62a Mon Sep 17 00:00:00 2001 From: "gauthier.ampe@fr.clara.net" Date: Mon, 23 Sep 2019 17:05:57 +0200 Subject: [PATCH 2/9] MON-366 fix extra-tags --- cloud/azure/app-gateway/modules.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloud/azure/app-gateway/modules.tf b/cloud/azure/app-gateway/modules.tf index f2595dc..ca3991d 100644 --- a/cloud/azure/app-gateway/modules.tf +++ b/cloud/azure/app-gateway/modules.tf @@ -16,7 +16,7 @@ module "filter-tags-4xx-error" { filter_tags_use_defaults = var.filter_tags_use_defaults filter_tags_custom = var.filter_tags_custom filter_tags_custom_excluded = var.filter_tags_custom_excluded - extra_tags = ["httpstatus:5xx"] + extra_tags = ["httpstatusgroup:4xx"] } module "filter-tags-5xx-error" { @@ -27,5 +27,5 @@ module "filter-tags-5xx-error" { filter_tags_use_defaults = var.filter_tags_use_defaults filter_tags_custom = var.filter_tags_custom filter_tags_custom_excluded = var.filter_tags_custom_excluded - extra_tags = ["httpstatus:5xx"] + extra_tags = ["httpstatusgroup:5xx"] } From df45723478d887c278da14ecf5ace99770e66fc6 Mon Sep 17 00:00:00 2001 From: "gauthier.ampe@fr.clara.net" Date: Mon, 23 Sep 2019 17:25:52 +0200 Subject: [PATCH 3/9] MON-366 Remove instance from query filter --- cloud/azure/app-gateway/monitors-app_services.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud/azure/app-gateway/monitors-app_services.tf b/cloud/azure/app-gateway/monitors-app_services.tf index bdfb76b..f8e1810 100644 --- a/cloud/azure/app-gateway/monitors-app_services.tf +++ b/cloud/azure/app-gateway/monitors-app_services.tf @@ -129,8 +129,8 @@ resource "datadog_monitor" "appgateway_http_4xx_errors" { query = < ${var.appgateway_http_4xx_errors_threshold_critical} EOQ @@ -164,8 +164,8 @@ resource "datadog_monitor" "appgateway_http_5xx_errors" { query = < ${var.appgateway_http_5xx_errors_threshold_critical} EOQ From 3f442f23d5684a4c5dea0c5d6127b61477e2ab74 Mon Sep 17 00:00:00 2001 From: "gauthier.ampe@fr.clara.net" Date: Tue, 24 Sep 2019 10:58:32 +0200 Subject: [PATCH 4/9] MON-366 Add multiple monitors --- cloud/azure/app-gateway/README.md | 43 +++++- cloud/azure/app-gateway/inputs.tf | 142 ++++++++++++++++-- cloud/azure/app-gateway/modules.tf | 22 +++ .../app-gateway/monitors-app_services.tf | 125 +++++++++++++-- cloud/azure/app-gateway/outputs.tf | 15 ++ 5 files changed, 320 insertions(+), 27 deletions(-) diff --git a/cloud/azure/app-gateway/README.md b/cloud/azure/app-gateway/README.md index 2333b2d..ca1611c 100644 --- a/cloud/azure/app-gateway/README.md +++ b/cloud/azure/app-gateway/README.md @@ -16,22 +16,46 @@ module "datadog-monitors-cloud-azure-app-gateway" { Creates DataDog monitors with the following checks: -- App Gateway failed requests (disabled by default) -- App Gateway HTTP 4xx errors too high -- App Gateway HTTP 5xx errors too high +- App Gateway backend connect time +- App Gateway Backend HTTP 4xx errors +- App Gateway failed requests +- App Gateway HTTP 4xx errors +- App Gateway HTTP 5xx errors +- App Gateway HTTP Backend 5xx errors - App Gateway is down -- App Gateway no connection (disabled by default) +- App Gateway no connection - App Gateway no healthy host ## Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| appgateway\_failed\_requests\_enabled | Flag to enable App Gateway failed requests monitor | string | `"false"` | no | +| appgateway\_backend\_connect\_time\_enabled | Flag to enable App Gateway backend_connect_time monitor | string | `"true"` | no | +| appgateway\_backend\_connect\_time\_extra\_tags | Extra tags for App Gateway backend_connect_time monitor | list(string) | `[]` | no | +| appgateway\_backend\_connect\_time\_message | Custom message for App Gateway backend_connect_time monitor | string | `""` | no | +| appgateway\_backend\_connect\_time\_threshold\_critical | Maximum critical backend_connect_time errors | string | `"50"` | no | +| appgateway\_backend\_connect\_time\_threshold\_warning | Warning regarding backend_connect_time errors | string | `"40"` | no | +| appgateway\_backend\_connect\_time\_time\_aggregator | Monitor aggregator for App Gateway backend_connect_time [available values: min, max or avg] | string | `"min"` | no | +| appgateway\_backend\_connect\_time\_timeframe | Monitor timeframe for App Gateway backend_connect_time [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| appgateway\_backend\_http\_4xx\_errors\_enabled | Flag to enable App Gateway http 4xx errors monitor | string | `"true"` | no | +| appgateway\_backend\_http\_4xx\_errors\_extra\_tags | Extra tags for App Gateway http 4xx errors monitor | list(string) | `[]` | no | +| appgateway\_backend\_http\_4xx\_errors\_message | Custom message for App Gateway http 4xx errors monitor | string | `""` | no | +| appgateway\_backend\_http\_4xx\_errors\_threshold\_critical | Minimum critical acceptable percent of 4xx error | string | `"30"` | no | +| appgateway\_backend\_http\_4xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 4xx error | string | `"20"` | no | +| appgateway\_backend\_http\_4xx\_errors\_time\_aggregator | Monitor aggregator for App Gateway http 4xx errors [available values: min, max or avg] | string | `"max"` | no | +| appgateway\_backend\_http\_4xx\_errors\_timeframe | Monitor timeframe for App Gateway http 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| appgateway\_backend\_http\_5xx\_errors\_enabled | Flag to enable App Gateway http 5xx errors monitor | string | `"true"` | no | +| appgateway\_backend\_http\_5xx\_errors\_extra\_tags | Extra tags for App Gateway http 5xx errors monitor | list(string) | `[]` | no | +| appgateway\_backend\_http\_5xx\_errors\_message | Custom message for App Gateway http 5xx errors monitor | string | `""` | no | +| appgateway\_backend\_http\_5xx\_errors\_threshold\_critical | Minimum critical acceptable percent of 5xx error | string | `"30"` | no | +| appgateway\_backend\_http\_5xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 5xx error | string | `"20"` | no | +| appgateway\_backend\_http\_5xx\_errors\_time\_aggregator | Monitor aggregator for App Gateway http 5xx errors [available values: min, max or avg] | string | `"max"` | no | +| appgateway\_backend\_http\_5xx\_errors\_timeframe | Monitor timeframe for App Gateway http 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| appgateway\_failed\_requests\_enabled | Flag to enable App Gateway failed requests monitor | string | `"true"` | no | | appgateway\_failed\_requests\_extra\_tags | Extra tags for App Gateway failed requests monitor | list(string) | `[]` | no | | appgateway\_failed\_requests\_message | Custom message for App Gateway failed requests monitor | string | `""` | no | -| appgateway\_failed\_requests\_threshold\_critical | Maximum critical acceptable percent of failed errors | string | `"100"` | no | -| appgateway\_failed\_requests\_threshold\_warning | Warning regarding acceptable percent of failed errors | string | `"200"` | no | +| appgateway\_failed\_requests\_threshold\_critical | Maximum critical acceptable percent of failed errors | string | `"50"` | no | +| appgateway\_failed\_requests\_threshold\_warning | Warning regarding acceptable percent of failed errors | string | `"40"` | no | | appgateway\_failed\_requests\_time\_aggregator | Monitor aggregator for App Gateway failed requests [available values: min, max or avg] | string | `"min"` | no | | appgateway\_failed\_requests\_timeframe | Monitor timeframe for App Gateway failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | appgateway\_healthy\_host\_count\_enabled | Flag to enable App Gateway healthy host monitor | string | `"true"` | no | @@ -53,7 +77,7 @@ Creates DataDog monitors with the following checks: | appgateway\_http\_5xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 5xx error | string | `"20"` | no | | appgateway\_http\_5xx\_errors\_time\_aggregator | Monitor aggregator for App Gateway http 5xx errors [available values: min, max or avg] | string | `"max"` | no | | appgateway\_http\_5xx\_errors\_timeframe | Monitor timeframe for App Gateway http 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | -| current\_connection\_enabled | Flag to enable App Gateway current connections monitor | string | `"false"` | no | +| current\_connection\_enabled | Flag to enable App Gateway current connections monitor | string | `"true"` | no | | current\_connection\_extra\_tags | Extra tags for App Gateway current connections monitor | list(string) | `[]` | no | | current\_connection\_message | Custom message for App Gateway current connections monitor | string | `""` | no | | current\_connection\_time\_aggregator | Monitor aggregator for App Gateway current connections [available values: min, max or avg] | string | `"min"` | no | @@ -76,6 +100,9 @@ Creates DataDog monitors with the following checks: | Name | Description | |------|-------------| +| appgateway\_backend\_connect\_time\_id | id for monitor appgateway_backend_connect_time | +| appgateway\_backend\_http\_4xx\_errors\_id | id for monitor appgateway_backend_http_4xx_errors | +| appgateway\_backend\_http\_5xx\_errors\_id | id for monitor appgateway_backend_http_5xx_errors | | appgateway\_failed\_requests\_id | id for monitor appgateway_failed_requests | | appgateway\_healthy\_host\_count\_id | id for monitor appgateway_healthy_host_count | | appgateway\_http\_4xx\_errors\_id | id for monitor appgateway_http_4xx_errors | diff --git a/cloud/azure/app-gateway/inputs.tf b/cloud/azure/app-gateway/inputs.tf index 8cdb5a4..a59c5ae 100644 --- a/cloud/azure/app-gateway/inputs.tf +++ b/cloud/azure/app-gateway/inputs.tf @@ -1,3 +1,4 @@ +# Azure App Gateway global variables variable "environment" { description = "Architecture environment" type = string @@ -69,11 +70,11 @@ variable "status_timeframe" { default = "last_5m" } -# Monitoring App Gateway current_connections (count) +# Monitoring App Gateway current_connections variable "current_connection_enabled" { description = "Flag to enable App Gateway current connections monitor" type = string - default = "false" + default = "true" } variable "current_connection_extra_tags" { @@ -100,11 +101,52 @@ variable "current_connection_timeframe" { default = "last_5m" } -# Monitoring App Gateway failed_requests (count) +# Monitoring App Gateway failed_requests +variable "appgateway_backend_connect_time_enabled" { + description = "Flag to enable App Gateway backend_connect_time monitor" + type = string + default = "true" +} + +variable "appgateway_backend_connect_time_extra_tags" { + description = "Extra tags for App Gateway backend_connect_time monitor" + type = list(string) + default = [] +} + +variable "appgateway_backend_connect_time_message" { + description = "Custom message for App Gateway backend_connect_time monitor" + type = string + default = "" +} + +variable "appgateway_backend_connect_time_time_aggregator" { + description = "Monitor aggregator for App Gateway backend_connect_time [available values: min, max or avg]" + type = string + default = "min" +} + +variable "appgateway_backend_connect_time_timeframe" { + description = "Monitor timeframe for App Gateway backend_connect_time [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +variable "appgateway_backend_connect_time_threshold_critical" { + default = 50 + description = "Maximum critical backend_connect_time errors" +} + +variable "appgateway_backend_connect_time_threshold_warning" { + default = 40 + description = "Warning regarding backend_connect_time errors" +} + +# Monitoring App Gateway failed_requests variable "appgateway_failed_requests_enabled" { description = "Flag to enable App Gateway failed requests monitor" type = string - default = "false" + default = "true" } variable "appgateway_failed_requests_extra_tags" { @@ -132,16 +174,16 @@ variable "appgateway_failed_requests_timeframe" { } variable "appgateway_failed_requests_threshold_critical" { - default = 100 + default = 50 description = "Maximum critical acceptable percent of failed errors" } variable "appgateway_failed_requests_threshold_warning" { - default = 200 + default = 40 description = "Warning regarding acceptable percent of failed errors" } -# Monitoring App Gateway healthy_host_count (count) +# Monitoring App Gateway healthy_host_count variable "appgateway_healthy_host_count_enabled" { description = "Flag to enable App Gateway healthy host monitor" type = string @@ -172,7 +214,7 @@ variable "appgateway_healthy_host_count_timeframe" { default = "last_5m" } -# Monitoring App Gateway response_status 4xx (count) +# Monitoring App Gateway response_status 4xx variable "appgateway_http_4xx_errors_enabled" { description = "Flag to enable App Gateway http 4xx errors monitor" type = string @@ -213,7 +255,7 @@ variable "appgateway_http_4xx_errors_threshold_warning" { description = "Warning regarding acceptable percent of 4xx error" } -# Monitoring App Gateway response_status 5xx (count) +# Monitoring App Gateway response_status 5xx variable "appgateway_http_5xx_errors_enabled" { description = "Flag to enable App Gateway http 5xx errors monitor" type = string @@ -253,3 +295,85 @@ variable "appgateway_http_5xx_errors_threshold_warning" { default = 20 description = "Warning regarding acceptable percent of 5xx error" } + +# Monitoring App Gateway Backend response_status 4xx +variable "appgateway_backend_http_4xx_errors_enabled" { + description = "Flag to enable App Gateway http 4xx errors monitor" + type = string + default = "true" +} + +variable "appgateway_backend_http_4xx_errors_extra_tags" { + description = "Extra tags for App Gateway http 4xx errors monitor" + type = list(string) + default = [] +} + +variable "appgateway_backend_http_4xx_errors_message" { + description = "Custom message for App Gateway http 4xx errors monitor" + type = string + default = "" +} + +variable "appgateway_backend_http_4xx_errors_time_aggregator" { + description = "Monitor aggregator for App Gateway http 4xx errors [available values: min, max or avg]" + type = string + default = "max" +} + +variable "appgateway_backend_http_4xx_errors_timeframe" { + description = "Monitor timeframe for App Gateway http 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +variable "appgateway_backend_http_4xx_errors_threshold_critical" { + default = 30 + description = "Minimum critical acceptable percent of 4xx error" +} + +variable "appgateway_backend_http_4xx_errors_threshold_warning" { + default = 20 + description = "Warning regarding acceptable percent of 4xx error" +} + +# Monitoring App Gateway Backend response_status 5xx +variable "appgateway_backend_http_5xx_errors_enabled" { + description = "Flag to enable App Gateway http 5xx errors monitor" + type = string + default = "true" +} + +variable "appgateway_backend_http_5xx_errors_extra_tags" { + description = "Extra tags for App Gateway http 5xx errors monitor" + type = list(string) + default = [] +} + +variable "appgateway_backend_http_5xx_errors_message" { + description = "Custom message for App Gateway http 5xx errors monitor" + type = string + default = "" +} + +variable "appgateway_backend_http_5xx_errors_time_aggregator" { + description = "Monitor aggregator for App Gateway http 5xx errors [available values: min, max or avg]" + type = string + default = "max" +} + +variable "appgateway_backend_http_5xx_errors_timeframe" { + description = "Monitor timeframe for App Gateway http 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +variable "appgateway_backend_http_5xx_errors_threshold_critical" { + default = 30 + description = "Minimum critical acceptable percent of 5xx error" +} + +variable "appgateway_backend_http_5xx_errors_threshold_warning" { + default = 20 + description = "Warning regarding acceptable percent of 5xx error" +} diff --git a/cloud/azure/app-gateway/modules.tf b/cloud/azure/app-gateway/modules.tf index ca3991d..12b0c58 100644 --- a/cloud/azure/app-gateway/modules.tf +++ b/cloud/azure/app-gateway/modules.tf @@ -29,3 +29,25 @@ module "filter-tags-5xx-error" { filter_tags_custom_excluded = var.filter_tags_custom_excluded extra_tags = ["httpstatusgroup:5xx"] } + +module "filter-tags-backend-4xx-error" { + source = "../../../common/filter-tags" + + environment = var.environment + resource = "azure_app-gateway" + filter_tags_use_defaults = var.filter_tags_use_defaults + filter_tags_custom = var.filter_tags_custom + filter_tags_custom_excluded = var.filter_tags_custom_excluded + extra_tags = ["httpstatusgroup:4xx"] +} + +module "filter-tags-backend-5xx-error" { + source = "../../../common/filter-tags" + + environment = var.environment + resource = "azure_app-gateway" + filter_tags_use_defaults = var.filter_tags_use_defaults + filter_tags_custom = var.filter_tags_custom + filter_tags_custom_excluded = var.filter_tags_custom_excluded + extra_tags = ["httpstatusgroup:5xx"] +} diff --git a/cloud/azure/app-gateway/monitors-app_services.tf b/cloud/azure/app-gateway/monitors-app_services.tf index f8e1810..4ae74d6 100644 --- a/cloud/azure/app-gateway/monitors-app_services.tf +++ b/cloud/azure/app-gateway/monitors-app_services.tf @@ -28,7 +28,7 @@ EOQ } } -# Monitoring App Gateway current_connections (count) +# Monitoring App Gateway current_connections resource "datadog_monitor" "current_connection" { count = var.current_connection_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway no connection {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" @@ -57,16 +57,52 @@ EOQ } } -# Monitoring App Gateway failed_requests (count) +# Monitoring App Gateway backend_connect_time +resource "datadog_monitor" "appgateway_backend_connect_time" { + count = var.appgateway_backend_connect_time_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway backend connect time {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = coalesce(var.appgateway_backend_connect_time_message, var.message) + type = "query alert" + + query = < ${var.appgateway_backend_connect_time_threshold_critical} +EOQ + + thresholds = { + critical = var.appgateway_backend_connect_time_threshold_critical + warning = var.appgateway_backend_connect_time_threshold_warning + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + notify_no_data = false + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = false + + tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-gateway", "team:claranet", "created-by:terraform"], var.appgateway_backend_connect_time_extra_tags) + + lifecycle { + ignore_changes = ["silenced"] + } +} + +# Monitoring App Gateway failed_requests resource "datadog_monitor" "appgateway_failed_requests" { count = var.appgateway_failed_requests_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway failed requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = coalesce(var.appgateway_failed_requests_message, var.message) - type = "query alert" + type = "metric alert" query = < ${var.appgateway_failed_requests_threshold_critical} + ${var.appgateway_failed_requests_time_aggregator}(${var.appgateway_failed_requests_timeframe}): + default((default(avg:azure.network_applicationgateways.failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / + default(avg:azure.network_applicationgateways.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(),0) + * 100),0) > ${var.appgateway_failed_requests_threshold_critical} EOQ thresholds = { @@ -91,7 +127,7 @@ EOQ } } -# Monitoring App Gateway healthy_host_count (count) +# Monitoring App Gateway healthy_host_count resource "datadog_monitor" "appgateway_healthy_host_count" { count = var.appgateway_healthy_host_count_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway no healthy host {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" @@ -120,10 +156,10 @@ EOQ } } -# Monitoring App Gateway response_status 4xx (count) +# Monitoring App Gateway response_status 4xx resource "datadog_monitor" "appgateway_http_4xx_errors" { count = var.appgateway_http_4xx_errors_enabled == "true" ? 1 : 0 - name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway HTTP 4xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway HTTP 4xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = coalesce(var.appgateway_http_4xx_errors_message, var.message) type = "metric alert" @@ -155,10 +191,10 @@ EOQ } } -# Monitoring App Gateway response_status 5xx (count) +# Monitoring App Gateway response_status 5xx resource "datadog_monitor" "appgateway_http_5xx_errors" { count = var.appgateway_http_5xx_errors_enabled == "true" ? 1 : 0 - name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway HTTP 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway HTTP 5xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = coalesce(var.appgateway_http_5xx_errors_message, var.message) type = "query alert" @@ -188,3 +224,72 @@ EOQ ignore_changes = ["silenced"] } } + +# Monitoring App Gateway Backend response_status 4xx +resource "datadog_monitor" "appgateway_backend_http_4xx_errors" { + count = var.appgateway_backend_http_4xx_errors_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway Backend HTTP 4xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = coalesce(var.appgateway_backend_http_4xx_errors_message, var.message) + type = "metric alert" + + query = < ${var.appgateway_backend_http_4xx_errors_threshold_critical} +EOQ + + + thresholds = { + warning = var.appgateway_backend_http_4xx_errors_threshold_warning + critical = var.appgateway_backend_http_4xx_errors_threshold_critical + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + notify_no_data = false + renotify_interval = 0 + require_full_window = false + timeout_h = 1 + include_tags = true + + tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-gateway", "team:claranet", "created-by:terraform"], var.appgateway_backend_http_4xx_errors_extra_tags) + + lifecycle { + ignore_changes = ["silenced"] + } +} + +# Monitoring App Gateway Backend response_status 5xx +resource "datadog_monitor" "appgateway_backend_http_5xx_errors" { + count = var.appgateway_backend_http_5xx_errors_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway HTTP Backend 5xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = coalesce(var.appgateway_backend_http_5xx_errors_message, var.message) + type = "query alert" + + query = < ${var.appgateway_backend_http_5xx_errors_threshold_critical} +EOQ + + thresholds = { + warning = var.appgateway_backend_http_5xx_errors_threshold_warning + critical = var.appgateway_backend_http_5xx_errors_threshold_critical + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + notify_no_data = false + renotify_interval = 0 + require_full_window = false + timeout_h = 1 + include_tags = true + + tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-gateway", "team:claranet", "created-by:terraform"], var.appgateway_backend_http_5xx_errors_extra_tags) + + lifecycle { + ignore_changes = ["silenced"] + } +} diff --git a/cloud/azure/app-gateway/outputs.tf b/cloud/azure/app-gateway/outputs.tf index a50ee60..5c5c774 100644 --- a/cloud/azure/app-gateway/outputs.tf +++ b/cloud/azure/app-gateway/outputs.tf @@ -1,3 +1,18 @@ +output "appgateway_backend_connect_time_id" { + description = "id for monitor appgateway_backend_connect_time" + value = datadog_monitor.appgateway_backend_connect_time.*.id +} + +output "appgateway_backend_http_4xx_errors_id" { + description = "id for monitor appgateway_backend_http_4xx_errors" + value = datadog_monitor.appgateway_backend_http_4xx_errors.*.id +} + +output "appgateway_backend_http_5xx_errors_id" { + description = "id for monitor appgateway_backend_http_5xx_errors" + value = datadog_monitor.appgateway_backend_http_5xx_errors.*.id +} + output "appgateway_failed_requests_id" { description = "id for monitor appgateway_failed_requests" value = datadog_monitor.appgateway_failed_requests.*.id From 615ff0e78edcee79325090e703201cc9975293c3 Mon Sep 17 00:00:00 2001 From: "gauthier.ampe@fr.clara.net" Date: Wed, 25 Sep 2019 17:30:17 +0200 Subject: [PATCH 5/9] MON-366 Change file name with monitors-app_gateway.tf --- .../{monitors-app_services.tf => monitors-app_gateway.tf} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename cloud/azure/app-gateway/{monitors-app_services.tf => monitors-app_gateway.tf} (99%) diff --git a/cloud/azure/app-gateway/monitors-app_services.tf b/cloud/azure/app-gateway/monitors-app_gateway.tf similarity index 99% rename from cloud/azure/app-gateway/monitors-app_services.tf rename to cloud/azure/app-gateway/monitors-app_gateway.tf index 4ae74d6..49995ab 100644 --- a/cloud/azure/app-gateway/monitors-app_services.tf +++ b/cloud/azure/app-gateway/monitors-app_gateway.tf @@ -3,7 +3,7 @@ resource "datadog_monitor" "appgateway_status" { count = var.status_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway is down" message = coalesce(var.status_message, var.message) - type = "query alert" + type = "metric alert" query = < Date: Thu, 26 Sep 2019 17:46:59 +0200 Subject: [PATCH 6/9] MON-366 Improve 4xx and 5xx backend monitor --- cloud/azure/app-gateway/monitors-app_gateway.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud/azure/app-gateway/monitors-app_gateway.tf b/cloud/azure/app-gateway/monitors-app_gateway.tf index 49995ab..7ba011f 100644 --- a/cloud/azure/app-gateway/monitors-app_gateway.tf +++ b/cloud/azure/app-gateway/monitors-app_gateway.tf @@ -234,8 +234,8 @@ resource "datadog_monitor" "appgateway_backend_http_4xx_errors" { query = < ${var.appgateway_backend_http_4xx_errors_threshold_critical} EOQ @@ -269,8 +269,8 @@ resource "datadog_monitor" "appgateway_backend_http_5xx_errors" { query = < ${var.appgateway_backend_http_5xx_errors_threshold_critical} EOQ From 0097fd15acf13528d1f352769d9c6969d2a75a4b Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Tue, 5 Nov 2019 10:33:27 +0100 Subject: [PATCH 7/9] MON-366 Update Azure Application Gateway monitors with suggestions --- cloud/azure/app-gateway/README.md | 46 +++++++++---------- cloud/azure/app-gateway/inputs.tf | 36 +++++++-------- .../azure/app-gateway/monitors-app_gateway.tf | 32 ++++++------- 3 files changed, 57 insertions(+), 57 deletions(-) diff --git a/cloud/azure/app-gateway/README.md b/cloud/azure/app-gateway/README.md index ca1611c..1ec341d 100644 --- a/cloud/azure/app-gateway/README.md +++ b/cloud/azure/app-gateway/README.md @@ -16,15 +16,15 @@ module "datadog-monitors-cloud-azure-app-gateway" { Creates DataDog monitors with the following checks: -- App Gateway backend connect time -- App Gateway Backend HTTP 4xx errors +- App Gateway backend connect time is too high +- App Gateway backend has no healthy host +- App Gateway backend HTTP 4xx errors rate is too high +- App Gateway backend HTTP 5xx errors rate is too high - App Gateway failed requests -- App Gateway HTTP 4xx errors -- App Gateway HTTP 5xx errors -- App Gateway HTTP Backend 5xx errors +- App Gateway has no connection +- App Gateway HTTP 4xx errors rate is too high +- App Gateway HTTP 5xx errors rate is too high - App Gateway is down -- App Gateway no connection -- App Gateway no healthy host ## Inputs @@ -33,54 +33,54 @@ Creates DataDog monitors with the following checks: | appgateway\_backend\_connect\_time\_enabled | Flag to enable App Gateway backend_connect_time monitor | string | `"true"` | no | | appgateway\_backend\_connect\_time\_extra\_tags | Extra tags for App Gateway backend_connect_time monitor | list(string) | `[]` | no | | appgateway\_backend\_connect\_time\_message | Custom message for App Gateway backend_connect_time monitor | string | `""` | no | -| appgateway\_backend\_connect\_time\_threshold\_critical | Maximum critical backend_connect_time errors | string | `"50"` | no | -| appgateway\_backend\_connect\_time\_threshold\_warning | Warning regarding backend_connect_time errors | string | `"40"` | no | -| appgateway\_backend\_connect\_time\_time\_aggregator | Monitor aggregator for App Gateway backend_connect_time [available values: min, max or avg] | string | `"min"` | no | +| appgateway\_backend\_connect\_time\_threshold\_critical | Maximum critical backend_connect_time errors in seconds | string | `"50"` | no | +| appgateway\_backend\_connect\_time\_threshold\_warning | Warning regarding backend_connect_time errors in seconds | string | `"40"` | no | +| appgateway\_backend\_connect\_time\_time\_aggregator | Monitor aggregator for App Gateway backend_connect_time [available values: min, max or avg] | string | `"max"` | no | | appgateway\_backend\_connect\_time\_timeframe | Monitor timeframe for App Gateway backend_connect_time [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | appgateway\_backend\_http\_4xx\_errors\_enabled | Flag to enable App Gateway http 4xx errors monitor | string | `"true"` | no | | appgateway\_backend\_http\_4xx\_errors\_extra\_tags | Extra tags for App Gateway http 4xx errors monitor | list(string) | `[]` | no | | appgateway\_backend\_http\_4xx\_errors\_message | Custom message for App Gateway http 4xx errors monitor | string | `""` | no | -| appgateway\_backend\_http\_4xx\_errors\_threshold\_critical | Minimum critical acceptable percent of 4xx error | string | `"30"` | no | -| appgateway\_backend\_http\_4xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 4xx error | string | `"20"` | no | +| appgateway\_backend\_http\_4xx\_errors\_threshold\_critical | Minimum critical acceptable percent of 4xx error | string | `"95"` | no | +| appgateway\_backend\_http\_4xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 4xx error | string | `"80"` | no | | appgateway\_backend\_http\_4xx\_errors\_time\_aggregator | Monitor aggregator for App Gateway http 4xx errors [available values: min, max or avg] | string | `"max"` | no | | appgateway\_backend\_http\_4xx\_errors\_timeframe | Monitor timeframe for App Gateway http 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | appgateway\_backend\_http\_5xx\_errors\_enabled | Flag to enable App Gateway http 5xx errors monitor | string | `"true"` | no | | appgateway\_backend\_http\_5xx\_errors\_extra\_tags | Extra tags for App Gateway http 5xx errors monitor | list(string) | `[]` | no | | appgateway\_backend\_http\_5xx\_errors\_message | Custom message for App Gateway http 5xx errors monitor | string | `""` | no | -| appgateway\_backend\_http\_5xx\_errors\_threshold\_critical | Minimum critical acceptable percent of 5xx error | string | `"30"` | no | -| appgateway\_backend\_http\_5xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 5xx error | string | `"20"` | no | +| appgateway\_backend\_http\_5xx\_errors\_threshold\_critical | Minimum critical acceptable percent of 5xx error | string | `"95"` | no | +| appgateway\_backend\_http\_5xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 5xx error | string | `"80"` | no | | appgateway\_backend\_http\_5xx\_errors\_time\_aggregator | Monitor aggregator for App Gateway http 5xx errors [available values: min, max or avg] | string | `"max"` | no | | appgateway\_backend\_http\_5xx\_errors\_timeframe | Monitor timeframe for App Gateway http 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | appgateway\_failed\_requests\_enabled | Flag to enable App Gateway failed requests monitor | string | `"true"` | no | | appgateway\_failed\_requests\_extra\_tags | Extra tags for App Gateway failed requests monitor | list(string) | `[]` | no | | appgateway\_failed\_requests\_message | Custom message for App Gateway failed requests monitor | string | `""` | no | -| appgateway\_failed\_requests\_threshold\_critical | Maximum critical acceptable percent of failed errors | string | `"50"` | no | -| appgateway\_failed\_requests\_threshold\_warning | Warning regarding acceptable percent of failed errors | string | `"40"` | no | +| appgateway\_failed\_requests\_threshold\_critical | Maximum critical acceptable percent of failed errors | string | `"95"` | no | +| appgateway\_failed\_requests\_threshold\_warning | Warning regarding acceptable percent of failed errors | string | `"80"` | no | | appgateway\_failed\_requests\_time\_aggregator | Monitor aggregator for App Gateway failed requests [available values: min, max or avg] | string | `"min"` | no | | appgateway\_failed\_requests\_timeframe | Monitor timeframe for App Gateway failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | appgateway\_healthy\_host\_count\_enabled | Flag to enable App Gateway healthy host monitor | string | `"true"` | no | | appgateway\_healthy\_host\_count\_extra\_tags | Extra tags for App Gateway healthy host monitor | list(string) | `[]` | no | | appgateway\_healthy\_host\_count\_message | Custom message for App Gateway healthy host monitor | string | `""` | no | -| appgateway\_healthy\_host\_count\_time\_aggregator | Monitor aggregator for App Gateway healthy host [available values: min, max or avg] | string | `"min"` | no | +| appgateway\_healthy\_host\_count\_time\_aggregator | Monitor aggregator for App Gateway healthy host [available values: min, max or avg] | string | `"max"` | no | | appgateway\_healthy\_host\_count\_timeframe | Monitor timeframe for App Gateway healthy host [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | appgateway\_http\_4xx\_errors\_enabled | Flag to enable App Gateway http 4xx errors monitor | string | `"true"` | no | | appgateway\_http\_4xx\_errors\_extra\_tags | Extra tags for App Gateway http 4xx errors monitor | list(string) | `[]` | no | | appgateway\_http\_4xx\_errors\_message | Custom message for App Gateway http 4xx errors monitor | string | `""` | no | -| appgateway\_http\_4xx\_errors\_threshold\_critical | Minimum critical acceptable percent of 4xx error | string | `"30"` | no | -| appgateway\_http\_4xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 4xx error | string | `"20"` | no | +| appgateway\_http\_4xx\_errors\_threshold\_critical | Maximum critical acceptable percent of 4xx error | string | `"95"` | no | +| appgateway\_http\_4xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 4xx error | string | `"80"` | no | | appgateway\_http\_4xx\_errors\_time\_aggregator | Monitor aggregator for App Gateway http 4xx errors [available values: min, max or avg] | string | `"max"` | no | | appgateway\_http\_4xx\_errors\_timeframe | Monitor timeframe for App Gateway http 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | appgateway\_http\_5xx\_errors\_enabled | Flag to enable App Gateway http 5xx errors monitor | string | `"true"` | no | | appgateway\_http\_5xx\_errors\_extra\_tags | Extra tags for App Gateway http 5xx errors monitor | list(string) | `[]` | no | | appgateway\_http\_5xx\_errors\_message | Custom message for App Gateway http 5xx errors monitor | string | `""` | no | -| appgateway\_http\_5xx\_errors\_threshold\_critical | Minimum critical acceptable percent of 5xx error | string | `"30"` | no | -| appgateway\_http\_5xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 5xx error | string | `"20"` | no | +| appgateway\_http\_5xx\_errors\_threshold\_critical | Maximum critical acceptable percent of 5xx error | string | `"95"` | no | +| appgateway\_http\_5xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 5xx error | string | `"80"` | no | | appgateway\_http\_5xx\_errors\_time\_aggregator | Monitor aggregator for App Gateway http 5xx errors [available values: min, max or avg] | string | `"max"` | no | | appgateway\_http\_5xx\_errors\_timeframe | Monitor timeframe for App Gateway http 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | current\_connection\_enabled | Flag to enable App Gateway current connections monitor | string | `"true"` | no | | current\_connection\_extra\_tags | Extra tags for App Gateway current connections monitor | list(string) | `[]` | no | | current\_connection\_message | Custom message for App Gateway current connections monitor | string | `""` | no | -| current\_connection\_time\_aggregator | Monitor aggregator for App Gateway current connections [available values: min, max or avg] | string | `"min"` | no | +| current\_connection\_time\_aggregator | Monitor aggregator for App Gateway current connections [available values: min, max or avg] | string | `"max"` | no | | current\_connection\_timeframe | Monitor timeframe for App Gateway current connections [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | environment | Architecture environment | string | n/a | yes | | evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no | @@ -93,7 +93,7 @@ Creates DataDog monitors with the following checks: | status\_enabled | Flag to enable App Gateway status | string | `"true"` | no | | status\_extra\_tags | Extra tags for App Gateway status | list(string) | `[]` | no | | status\_message | Custom message for App Gateway status | string | `""` | no | -| status\_time\_aggregator | Monitor aggregator for App Gateway status [available values: min, max or avg] | string | `"min"` | no | +| status\_time\_aggregator | Monitor aggregator for App Gateway status [available values: min, max or avg] | string | `"max"` | no | | status\_timeframe | Monitor timeframe for App Gateway status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | ## Outputs diff --git a/cloud/azure/app-gateway/inputs.tf b/cloud/azure/app-gateway/inputs.tf index a59c5ae..1040aad 100644 --- a/cloud/azure/app-gateway/inputs.tf +++ b/cloud/azure/app-gateway/inputs.tf @@ -61,7 +61,7 @@ variable "status_message" { variable "status_time_aggregator" { description = "Monitor aggregator for App Gateway status [available values: min, max or avg]" type = string - default = "min" + default = "max" } variable "status_timeframe" { @@ -92,7 +92,7 @@ variable "current_connection_message" { variable "current_connection_time_aggregator" { description = "Monitor aggregator for App Gateway current connections [available values: min, max or avg]" type = string - default = "min" + default = "max" } variable "current_connection_timeframe" { @@ -123,7 +123,7 @@ variable "appgateway_backend_connect_time_message" { variable "appgateway_backend_connect_time_time_aggregator" { description = "Monitor aggregator for App Gateway backend_connect_time [available values: min, max or avg]" type = string - default = "min" + default = "max" } variable "appgateway_backend_connect_time_timeframe" { @@ -134,12 +134,12 @@ variable "appgateway_backend_connect_time_timeframe" { variable "appgateway_backend_connect_time_threshold_critical" { default = 50 - description = "Maximum critical backend_connect_time errors" + description = "Maximum critical backend_connect_time errors in seconds" } variable "appgateway_backend_connect_time_threshold_warning" { default = 40 - description = "Warning regarding backend_connect_time errors" + description = "Warning regarding backend_connect_time errors in seconds" } # Monitoring App Gateway failed_requests @@ -174,12 +174,12 @@ variable "appgateway_failed_requests_timeframe" { } variable "appgateway_failed_requests_threshold_critical" { - default = 50 + default = 95 description = "Maximum critical acceptable percent of failed errors" } variable "appgateway_failed_requests_threshold_warning" { - default = 40 + default = 80 description = "Warning regarding acceptable percent of failed errors" } @@ -205,7 +205,7 @@ variable "appgateway_healthy_host_count_message" { variable "appgateway_healthy_host_count_time_aggregator" { description = "Monitor aggregator for App Gateway healthy host [available values: min, max or avg]" type = string - default = "min" + default = "max" } variable "appgateway_healthy_host_count_timeframe" { @@ -246,12 +246,12 @@ variable "appgateway_http_4xx_errors_timeframe" { } variable "appgateway_http_4xx_errors_threshold_critical" { - default = 30 - description = "Minimum critical acceptable percent of 4xx error" + default = 95 + description = "Maximum critical acceptable percent of 4xx error" } variable "appgateway_http_4xx_errors_threshold_warning" { - default = 20 + default = 80 description = "Warning regarding acceptable percent of 4xx error" } @@ -287,12 +287,12 @@ variable "appgateway_http_5xx_errors_timeframe" { } variable "appgateway_http_5xx_errors_threshold_critical" { - default = 30 - description = "Minimum critical acceptable percent of 5xx error" + default = 95 + description = "Maximum critical acceptable percent of 5xx error" } variable "appgateway_http_5xx_errors_threshold_warning" { - default = 20 + default = 80 description = "Warning regarding acceptable percent of 5xx error" } @@ -328,12 +328,12 @@ variable "appgateway_backend_http_4xx_errors_timeframe" { } variable "appgateway_backend_http_4xx_errors_threshold_critical" { - default = 30 + default = 95 description = "Minimum critical acceptable percent of 4xx error" } variable "appgateway_backend_http_4xx_errors_threshold_warning" { - default = 20 + default = 80 description = "Warning regarding acceptable percent of 4xx error" } @@ -369,11 +369,11 @@ variable "appgateway_backend_http_5xx_errors_timeframe" { } variable "appgateway_backend_http_5xx_errors_threshold_critical" { - default = 30 + default = 95 description = "Minimum critical acceptable percent of 5xx error" } variable "appgateway_backend_http_5xx_errors_threshold_warning" { - default = 20 + default = 80 description = "Warning regarding acceptable percent of 5xx error" } diff --git a/cloud/azure/app-gateway/monitors-app_gateway.tf b/cloud/azure/app-gateway/monitors-app_gateway.tf index 7ba011f..a204d96 100644 --- a/cloud/azure/app-gateway/monitors-app_gateway.tf +++ b/cloud/azure/app-gateway/monitors-app_gateway.tf @@ -31,13 +31,13 @@ EOQ # Monitoring App Gateway current_connections resource "datadog_monitor" "current_connection" { count = var.current_connection_enabled == "true" ? 1 : 0 - name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway no connection {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway has no connection" message = coalesce(var.current_connection_message, var.message) type = "query alert" query = < ${var.appgateway_backend_connect_time_threshold_critical} + sum:azure.network_applicationgateways.backend_connect_time${module.filter-tags.query_alert} by {resource_group,region,name,backendhttpsetting,backendpool,backendserver} > ${var.appgateway_backend_connect_time_threshold_critical} EOQ thresholds = { @@ -100,8 +100,8 @@ resource "datadog_monitor" "appgateway_failed_requests" { query = < ${var.appgateway_failed_requests_threshold_critical} EOQ @@ -130,13 +130,13 @@ EOQ # Monitoring App Gateway healthy_host_count resource "datadog_monitor" "appgateway_healthy_host_count" { count = var.appgateway_healthy_host_count_enabled == "true" ? 1 : 0 - name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway no healthy host {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway backend has no healthy host" message = coalesce(var.appgateway_healthy_host_count_message, var.message) type = "query alert" query = < ${var.appgateway_backend_http_4xx_errors_threshold_critical} EOQ @@ -263,14 +263,14 @@ EOQ # Monitoring App Gateway Backend response_status 5xx resource "datadog_monitor" "appgateway_backend_http_5xx_errors" { count = var.appgateway_backend_http_5xx_errors_enabled == "true" ? 1 : 0 - name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway HTTP Backend 5xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway backend HTTP 5xx errors rate is too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = coalesce(var.appgateway_backend_http_5xx_errors_message, var.message) type = "query alert" query = < ${var.appgateway_backend_http_5xx_errors_threshold_critical} EOQ From 1de02e53d84f59cf9eeb9766a8854fb0f7d44366 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Tue, 5 Nov 2019 12:05:16 +0100 Subject: [PATCH 8/9] MON-366 Change static healthy host check to a ratio --- cloud/azure/app-gateway/README.md | 20 ++++++----- cloud/azure/app-gateway/inputs.tf | 36 ++++++++++++------- .../azure/app-gateway/monitors-app_gateway.tf | 23 +++++++----- cloud/azure/app-gateway/outputs.tf | 6 ++-- 4 files changed, 52 insertions(+), 33 deletions(-) diff --git a/cloud/azure/app-gateway/README.md b/cloud/azure/app-gateway/README.md index 1ec341d..23ed4a1 100644 --- a/cloud/azure/app-gateway/README.md +++ b/cloud/azure/app-gateway/README.md @@ -17,9 +17,9 @@ module "datadog-monitors-cloud-azure-app-gateway" { Creates DataDog monitors with the following checks: - App Gateway backend connect time is too high -- App Gateway backend has no healthy host - App Gateway backend HTTP 4xx errors rate is too high - App Gateway backend HTTP 5xx errors rate is too high +- App Gateway backend unhealthy host ratio is too high - App Gateway failed requests - App Gateway has no connection - App Gateway HTTP 4xx errors rate is too high @@ -33,8 +33,8 @@ Creates DataDog monitors with the following checks: | appgateway\_backend\_connect\_time\_enabled | Flag to enable App Gateway backend_connect_time monitor | string | `"true"` | no | | appgateway\_backend\_connect\_time\_extra\_tags | Extra tags for App Gateway backend_connect_time monitor | list(string) | `[]` | no | | appgateway\_backend\_connect\_time\_message | Custom message for App Gateway backend_connect_time monitor | string | `""` | no | -| appgateway\_backend\_connect\_time\_threshold\_critical | Maximum critical backend_connect_time errors in seconds | string | `"50"` | no | -| appgateway\_backend\_connect\_time\_threshold\_warning | Warning regarding backend_connect_time errors in seconds | string | `"40"` | no | +| appgateway\_backend\_connect\_time\_threshold\_critical | Maximum critical backend_connect_time errors in milliseconds | string | `"50"` | no | +| appgateway\_backend\_connect\_time\_threshold\_warning | Warning regarding backend_connect_time errors in milliseconds | string | `"40"` | no | | appgateway\_backend\_connect\_time\_time\_aggregator | Monitor aggregator for App Gateway backend_connect_time [available values: min, max or avg] | string | `"max"` | no | | appgateway\_backend\_connect\_time\_timeframe | Monitor timeframe for App Gateway backend_connect_time [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | appgateway\_backend\_http\_4xx\_errors\_enabled | Flag to enable App Gateway http 4xx errors monitor | string | `"true"` | no | @@ -58,11 +58,6 @@ Creates DataDog monitors with the following checks: | appgateway\_failed\_requests\_threshold\_warning | Warning regarding acceptable percent of failed errors | string | `"80"` | no | | appgateway\_failed\_requests\_time\_aggregator | Monitor aggregator for App Gateway failed requests [available values: min, max or avg] | string | `"min"` | no | | appgateway\_failed\_requests\_timeframe | Monitor timeframe for App Gateway failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | -| appgateway\_healthy\_host\_count\_enabled | Flag to enable App Gateway healthy host monitor | string | `"true"` | no | -| appgateway\_healthy\_host\_count\_extra\_tags | Extra tags for App Gateway healthy host monitor | list(string) | `[]` | no | -| appgateway\_healthy\_host\_count\_message | Custom message for App Gateway healthy host monitor | string | `""` | no | -| appgateway\_healthy\_host\_count\_time\_aggregator | Monitor aggregator for App Gateway healthy host [available values: min, max or avg] | string | `"max"` | no | -| appgateway\_healthy\_host\_count\_timeframe | Monitor timeframe for App Gateway healthy host [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | appgateway\_http\_4xx\_errors\_enabled | Flag to enable App Gateway http 4xx errors monitor | string | `"true"` | no | | appgateway\_http\_4xx\_errors\_extra\_tags | Extra tags for App Gateway http 4xx errors monitor | list(string) | `[]` | no | | appgateway\_http\_4xx\_errors\_message | Custom message for App Gateway http 4xx errors monitor | string | `""` | no | @@ -77,6 +72,13 @@ Creates DataDog monitors with the following checks: | appgateway\_http\_5xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 5xx error | string | `"80"` | no | | appgateway\_http\_5xx\_errors\_time\_aggregator | Monitor aggregator for App Gateway http 5xx errors [available values: min, max or avg] | string | `"max"` | no | | appgateway\_http\_5xx\_errors\_timeframe | Monitor timeframe for App Gateway http 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| appgateway\_unhealthy\_host\_ratio\_enabled | Flag to enable App Gateway unhealthy host ratio monitor | string | `"true"` | no | +| appgateway\_unhealthy\_host\_ratio\_extra\_tags | Extra tags for App Gateway unhealthy host ratio monitor | list(string) | `[]` | no | +| appgateway\_unhealthy\_host\_ratio\_message | Custom message for App Gateway unhealthy host ratio monitor | string | `""` | no | +| appgateway\_unhealthy\_host\_ratio\_threshold\_critical | Maximum critical acceptable ratio of unhealthy host | string | `"75"` | no | +| appgateway\_unhealthy\_host\_ratio\_threshold\_warning | Warning regarding acceptable ratio of unhealthy host | string | `"50"` | no | +| appgateway\_unhealthy\_host\_ratio\_time\_aggregator | Monitor aggregator for App Gateway unhealthy host ratio [available values: min, max or avg] | string | `"max"` | no | +| appgateway\_unhealthy\_host\_ratio\_timeframe | Monitor timeframe for App Gateway unhealthy host ratio [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | current\_connection\_enabled | Flag to enable App Gateway current connections monitor | string | `"true"` | no | | current\_connection\_extra\_tags | Extra tags for App Gateway current connections monitor | list(string) | `[]` | no | | current\_connection\_message | Custom message for App Gateway current connections monitor | string | `""` | no | @@ -104,7 +106,7 @@ Creates DataDog monitors with the following checks: | appgateway\_backend\_http\_4xx\_errors\_id | id for monitor appgateway_backend_http_4xx_errors | | appgateway\_backend\_http\_5xx\_errors\_id | id for monitor appgateway_backend_http_5xx_errors | | appgateway\_failed\_requests\_id | id for monitor appgateway_failed_requests | -| appgateway\_healthy\_host\_count\_id | id for monitor appgateway_healthy_host_count | +| appgateway\_healthy\_host\_ratio\_id | id for monitor appgateway_healthy_host_ratio | | appgateway\_http\_4xx\_errors\_id | id for monitor appgateway_http_4xx_errors | | appgateway\_http\_5xx\_errors\_id | id for monitor appgateway_http_5xx_errors | | appgateway\_status\_id | id for monitor appgateway_status | diff --git a/cloud/azure/app-gateway/inputs.tf b/cloud/azure/app-gateway/inputs.tf index 1040aad..91d338e 100644 --- a/cloud/azure/app-gateway/inputs.tf +++ b/cloud/azure/app-gateway/inputs.tf @@ -134,12 +134,12 @@ variable "appgateway_backend_connect_time_timeframe" { variable "appgateway_backend_connect_time_threshold_critical" { default = 50 - description = "Maximum critical backend_connect_time errors in seconds" + description = "Maximum critical backend_connect_time errors in milliseconds" } variable "appgateway_backend_connect_time_threshold_warning" { default = 40 - description = "Warning regarding backend_connect_time errors in seconds" + description = "Warning regarding backend_connect_time errors in milliseconds" } # Monitoring App Gateway failed_requests @@ -183,37 +183,47 @@ variable "appgateway_failed_requests_threshold_warning" { description = "Warning regarding acceptable percent of failed errors" } -# Monitoring App Gateway healthy_host_count -variable "appgateway_healthy_host_count_enabled" { - description = "Flag to enable App Gateway healthy host monitor" +# Monitoring App Gateway unhealthy_host_ratio +variable "appgateway_unhealthy_host_ratio_enabled" { + description = "Flag to enable App Gateway unhealthy host ratio monitor" type = string default = "true" } -variable "appgateway_healthy_host_count_extra_tags" { - description = "Extra tags for App Gateway healthy host monitor" +variable "appgateway_unhealthy_host_ratio_extra_tags" { + description = "Extra tags for App Gateway unhealthy host ratio monitor" type = list(string) default = [] } -variable "appgateway_healthy_host_count_message" { - description = "Custom message for App Gateway healthy host monitor" +variable "appgateway_unhealthy_host_ratio_message" { + description = "Custom message for App Gateway unhealthy host ratio monitor" type = string default = "" } -variable "appgateway_healthy_host_count_time_aggregator" { - description = "Monitor aggregator for App Gateway healthy host [available values: min, max or avg]" +variable "appgateway_unhealthy_host_ratio_time_aggregator" { + description = "Monitor aggregator for App Gateway unhealthy host ratio [available values: min, max or avg]" type = string default = "max" } -variable "appgateway_healthy_host_count_timeframe" { - description = "Monitor timeframe for App Gateway healthy host [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" +variable "appgateway_unhealthy_host_ratio_timeframe" { + description = "Monitor timeframe for App Gateway unhealthy host ratio [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" type = string default = "last_5m" } +variable "appgateway_unhealthy_host_ratio_threshold_critical" { + default = 75 + description = "Maximum critical acceptable ratio of unhealthy host" +} + +variable "appgateway_unhealthy_host_ratio_threshold_warning" { + default = 50 + description = "Warning regarding acceptable ratio of unhealthy host" +} + # Monitoring App Gateway response_status 4xx variable "appgateway_http_4xx_errors_enabled" { description = "Flag to enable App Gateway http 4xx errors monitor" diff --git a/cloud/azure/app-gateway/monitors-app_gateway.tf b/cloud/azure/app-gateway/monitors-app_gateway.tf index a204d96..4aff7ef 100644 --- a/cloud/azure/app-gateway/monitors-app_gateway.tf +++ b/cloud/azure/app-gateway/monitors-app_gateway.tf @@ -127,18 +127,25 @@ EOQ } } -# Monitoring App Gateway healthy_host_count -resource "datadog_monitor" "appgateway_healthy_host_count" { - count = var.appgateway_healthy_host_count_enabled == "true" ? 1 : 0 - name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway backend has no healthy host" - message = coalesce(var.appgateway_healthy_host_count_message, var.message) +# Monitoring App Gateway unhealthy_host_ratio +resource "datadog_monitor" "appgateway_healthy_host_ratio" { + count = var.appgateway_unhealthy_host_ratio_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway backend unhealthy host ratio is too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = coalesce(var.appgateway_unhealthy_host_ratio_message, var.message) type = "query alert" query = < ${var.appgateway_unhealthy_host_ratio_threshold_critical} EOQ + thresholds = { + critical = var.appgateway_unhealthy_host_ratio_threshold_critical + warning = var.appgateway_unhealthy_host_ratio_threshold_warning + } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_no_data = false @@ -149,7 +156,7 @@ EOQ locked = false require_full_window = false - tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-gateway", "team:claranet", "created-by:terraform"], var.appgateway_healthy_host_count_extra_tags) + tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-gateway", "team:claranet", "created-by:terraform"], var.appgateway_unhealthy_host_ratio_extra_tags) lifecycle { ignore_changes = ["silenced"] diff --git a/cloud/azure/app-gateway/outputs.tf b/cloud/azure/app-gateway/outputs.tf index 5c5c774..c672fed 100644 --- a/cloud/azure/app-gateway/outputs.tf +++ b/cloud/azure/app-gateway/outputs.tf @@ -18,9 +18,9 @@ output "appgateway_failed_requests_id" { value = datadog_monitor.appgateway_failed_requests.*.id } -output "appgateway_healthy_host_count_id" { - description = "id for monitor appgateway_healthy_host_count" - value = datadog_monitor.appgateway_healthy_host_count.*.id +output "appgateway_healthy_host_ratio_id" { + description = "id for monitor appgateway_healthy_host_ratio" + value = datadog_monitor.appgateway_healthy_host_ratio.*.id } output "appgateway_http_4xx_errors_id" { From 67897a7c746b5cd2528042b92525804154d4e080 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Fri, 8 Nov 2019 10:42:53 +0100 Subject: [PATCH 9/9] MON-366: Rename monitor file --- .../{monitors-app_gateway.tf => monitors-app-gateway.tf} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cloud/azure/app-gateway/{monitors-app_gateway.tf => monitors-app-gateway.tf} (100%) diff --git a/cloud/azure/app-gateway/monitors-app_gateway.tf b/cloud/azure/app-gateway/monitors-app-gateway.tf similarity index 100% rename from cloud/azure/app-gateway/monitors-app_gateway.tf rename to cloud/azure/app-gateway/monitors-app-gateway.tf