From 1de02e53d84f59cf9eeb9766a8854fb0f7d44366 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Tue, 5 Nov 2019 12:05:16 +0100 Subject: [PATCH] MON-366 Change static healthy host check to a ratio --- cloud/azure/app-gateway/README.md | 20 ++++++----- cloud/azure/app-gateway/inputs.tf | 36 ++++++++++++------- .../azure/app-gateway/monitors-app_gateway.tf | 23 +++++++----- cloud/azure/app-gateway/outputs.tf | 6 ++-- 4 files changed, 52 insertions(+), 33 deletions(-) diff --git a/cloud/azure/app-gateway/README.md b/cloud/azure/app-gateway/README.md index 1ec341d..23ed4a1 100644 --- a/cloud/azure/app-gateway/README.md +++ b/cloud/azure/app-gateway/README.md @@ -17,9 +17,9 @@ module "datadog-monitors-cloud-azure-app-gateway" { Creates DataDog monitors with the following checks: - App Gateway backend connect time is too high -- App Gateway backend has no healthy host - App Gateway backend HTTP 4xx errors rate is too high - App Gateway backend HTTP 5xx errors rate is too high +- App Gateway backend unhealthy host ratio is too high - App Gateway failed requests - App Gateway has no connection - App Gateway HTTP 4xx errors rate is too high @@ -33,8 +33,8 @@ Creates DataDog monitors with the following checks: | appgateway\_backend\_connect\_time\_enabled | Flag to enable App Gateway backend_connect_time monitor | string | `"true"` | no | | appgateway\_backend\_connect\_time\_extra\_tags | Extra tags for App Gateway backend_connect_time monitor | list(string) | `[]` | no | | appgateway\_backend\_connect\_time\_message | Custom message for App Gateway backend_connect_time monitor | string | `""` | no | -| appgateway\_backend\_connect\_time\_threshold\_critical | Maximum critical backend_connect_time errors in seconds | string | `"50"` | no | -| appgateway\_backend\_connect\_time\_threshold\_warning | Warning regarding backend_connect_time errors in seconds | string | `"40"` | no | +| appgateway\_backend\_connect\_time\_threshold\_critical | Maximum critical backend_connect_time errors in milliseconds | string | `"50"` | no | +| appgateway\_backend\_connect\_time\_threshold\_warning | Warning regarding backend_connect_time errors in milliseconds | string | `"40"` | no | | appgateway\_backend\_connect\_time\_time\_aggregator | Monitor aggregator for App Gateway backend_connect_time [available values: min, max or avg] | string | `"max"` | no | | appgateway\_backend\_connect\_time\_timeframe | Monitor timeframe for App Gateway backend_connect_time [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | appgateway\_backend\_http\_4xx\_errors\_enabled | Flag to enable App Gateway http 4xx errors monitor | string | `"true"` | no | @@ -58,11 +58,6 @@ Creates DataDog monitors with the following checks: | appgateway\_failed\_requests\_threshold\_warning | Warning regarding acceptable percent of failed errors | string | `"80"` | no | | appgateway\_failed\_requests\_time\_aggregator | Monitor aggregator for App Gateway failed requests [available values: min, max or avg] | string | `"min"` | no | | appgateway\_failed\_requests\_timeframe | Monitor timeframe for App Gateway failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | -| appgateway\_healthy\_host\_count\_enabled | Flag to enable App Gateway healthy host monitor | string | `"true"` | no | -| appgateway\_healthy\_host\_count\_extra\_tags | Extra tags for App Gateway healthy host monitor | list(string) | `[]` | no | -| appgateway\_healthy\_host\_count\_message | Custom message for App Gateway healthy host monitor | string | `""` | no | -| appgateway\_healthy\_host\_count\_time\_aggregator | Monitor aggregator for App Gateway healthy host [available values: min, max or avg] | string | `"max"` | no | -| appgateway\_healthy\_host\_count\_timeframe | Monitor timeframe for App Gateway healthy host [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | appgateway\_http\_4xx\_errors\_enabled | Flag to enable App Gateway http 4xx errors monitor | string | `"true"` | no | | appgateway\_http\_4xx\_errors\_extra\_tags | Extra tags for App Gateway http 4xx errors monitor | list(string) | `[]` | no | | appgateway\_http\_4xx\_errors\_message | Custom message for App Gateway http 4xx errors monitor | string | `""` | no | @@ -77,6 +72,13 @@ Creates DataDog monitors with the following checks: | appgateway\_http\_5xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 5xx error | string | `"80"` | no | | appgateway\_http\_5xx\_errors\_time\_aggregator | Monitor aggregator for App Gateway http 5xx errors [available values: min, max or avg] | string | `"max"` | no | | appgateway\_http\_5xx\_errors\_timeframe | Monitor timeframe for App Gateway http 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| appgateway\_unhealthy\_host\_ratio\_enabled | Flag to enable App Gateway unhealthy host ratio monitor | string | `"true"` | no | +| appgateway\_unhealthy\_host\_ratio\_extra\_tags | Extra tags for App Gateway unhealthy host ratio monitor | list(string) | `[]` | no | +| appgateway\_unhealthy\_host\_ratio\_message | Custom message for App Gateway unhealthy host ratio monitor | string | `""` | no | +| appgateway\_unhealthy\_host\_ratio\_threshold\_critical | Maximum critical acceptable ratio of unhealthy host | string | `"75"` | no | +| appgateway\_unhealthy\_host\_ratio\_threshold\_warning | Warning regarding acceptable ratio of unhealthy host | string | `"50"` | no | +| appgateway\_unhealthy\_host\_ratio\_time\_aggregator | Monitor aggregator for App Gateway unhealthy host ratio [available values: min, max or avg] | string | `"max"` | no | +| appgateway\_unhealthy\_host\_ratio\_timeframe | Monitor timeframe for App Gateway unhealthy host ratio [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | current\_connection\_enabled | Flag to enable App Gateway current connections monitor | string | `"true"` | no | | current\_connection\_extra\_tags | Extra tags for App Gateway current connections monitor | list(string) | `[]` | no | | current\_connection\_message | Custom message for App Gateway current connections monitor | string | `""` | no | @@ -104,7 +106,7 @@ Creates DataDog monitors with the following checks: | appgateway\_backend\_http\_4xx\_errors\_id | id for monitor appgateway_backend_http_4xx_errors | | appgateway\_backend\_http\_5xx\_errors\_id | id for monitor appgateway_backend_http_5xx_errors | | appgateway\_failed\_requests\_id | id for monitor appgateway_failed_requests | -| appgateway\_healthy\_host\_count\_id | id for monitor appgateway_healthy_host_count | +| appgateway\_healthy\_host\_ratio\_id | id for monitor appgateway_healthy_host_ratio | | appgateway\_http\_4xx\_errors\_id | id for monitor appgateway_http_4xx_errors | | appgateway\_http\_5xx\_errors\_id | id for monitor appgateway_http_5xx_errors | | appgateway\_status\_id | id for monitor appgateway_status | diff --git a/cloud/azure/app-gateway/inputs.tf b/cloud/azure/app-gateway/inputs.tf index 1040aad..91d338e 100644 --- a/cloud/azure/app-gateway/inputs.tf +++ b/cloud/azure/app-gateway/inputs.tf @@ -134,12 +134,12 @@ variable "appgateway_backend_connect_time_timeframe" { variable "appgateway_backend_connect_time_threshold_critical" { default = 50 - description = "Maximum critical backend_connect_time errors in seconds" + description = "Maximum critical backend_connect_time errors in milliseconds" } variable "appgateway_backend_connect_time_threshold_warning" { default = 40 - description = "Warning regarding backend_connect_time errors in seconds" + description = "Warning regarding backend_connect_time errors in milliseconds" } # Monitoring App Gateway failed_requests @@ -183,37 +183,47 @@ variable "appgateway_failed_requests_threshold_warning" { description = "Warning regarding acceptable percent of failed errors" } -# Monitoring App Gateway healthy_host_count -variable "appgateway_healthy_host_count_enabled" { - description = "Flag to enable App Gateway healthy host monitor" +# Monitoring App Gateway unhealthy_host_ratio +variable "appgateway_unhealthy_host_ratio_enabled" { + description = "Flag to enable App Gateway unhealthy host ratio monitor" type = string default = "true" } -variable "appgateway_healthy_host_count_extra_tags" { - description = "Extra tags for App Gateway healthy host monitor" +variable "appgateway_unhealthy_host_ratio_extra_tags" { + description = "Extra tags for App Gateway unhealthy host ratio monitor" type = list(string) default = [] } -variable "appgateway_healthy_host_count_message" { - description = "Custom message for App Gateway healthy host monitor" +variable "appgateway_unhealthy_host_ratio_message" { + description = "Custom message for App Gateway unhealthy host ratio monitor" type = string default = "" } -variable "appgateway_healthy_host_count_time_aggregator" { - description = "Monitor aggregator for App Gateway healthy host [available values: min, max or avg]" +variable "appgateway_unhealthy_host_ratio_time_aggregator" { + description = "Monitor aggregator for App Gateway unhealthy host ratio [available values: min, max or avg]" type = string default = "max" } -variable "appgateway_healthy_host_count_timeframe" { - description = "Monitor timeframe for App Gateway healthy host [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" +variable "appgateway_unhealthy_host_ratio_timeframe" { + description = "Monitor timeframe for App Gateway unhealthy host ratio [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" type = string default = "last_5m" } +variable "appgateway_unhealthy_host_ratio_threshold_critical" { + default = 75 + description = "Maximum critical acceptable ratio of unhealthy host" +} + +variable "appgateway_unhealthy_host_ratio_threshold_warning" { + default = 50 + description = "Warning regarding acceptable ratio of unhealthy host" +} + # Monitoring App Gateway response_status 4xx variable "appgateway_http_4xx_errors_enabled" { description = "Flag to enable App Gateway http 4xx errors monitor" diff --git a/cloud/azure/app-gateway/monitors-app_gateway.tf b/cloud/azure/app-gateway/monitors-app_gateway.tf index a204d96..4aff7ef 100644 --- a/cloud/azure/app-gateway/monitors-app_gateway.tf +++ b/cloud/azure/app-gateway/monitors-app_gateway.tf @@ -127,18 +127,25 @@ EOQ } } -# Monitoring App Gateway healthy_host_count -resource "datadog_monitor" "appgateway_healthy_host_count" { - count = var.appgateway_healthy_host_count_enabled == "true" ? 1 : 0 - name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway backend has no healthy host" - message = coalesce(var.appgateway_healthy_host_count_message, var.message) +# Monitoring App Gateway unhealthy_host_ratio +resource "datadog_monitor" "appgateway_healthy_host_ratio" { + count = var.appgateway_unhealthy_host_ratio_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway backend unhealthy host ratio is too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = coalesce(var.appgateway_unhealthy_host_ratio_message, var.message) type = "query alert" query = < ${var.appgateway_unhealthy_host_ratio_threshold_critical} EOQ + thresholds = { + critical = var.appgateway_unhealthy_host_ratio_threshold_critical + warning = var.appgateway_unhealthy_host_ratio_threshold_warning + } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_no_data = false @@ -149,7 +156,7 @@ EOQ locked = false require_full_window = false - tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-gateway", "team:claranet", "created-by:terraform"], var.appgateway_healthy_host_count_extra_tags) + tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-gateway", "team:claranet", "created-by:terraform"], var.appgateway_unhealthy_host_ratio_extra_tags) lifecycle { ignore_changes = ["silenced"] diff --git a/cloud/azure/app-gateway/outputs.tf b/cloud/azure/app-gateway/outputs.tf index 5c5c774..c672fed 100644 --- a/cloud/azure/app-gateway/outputs.tf +++ b/cloud/azure/app-gateway/outputs.tf @@ -18,9 +18,9 @@ output "appgateway_failed_requests_id" { value = datadog_monitor.appgateway_failed_requests.*.id } -output "appgateway_healthy_host_count_id" { - description = "id for monitor appgateway_healthy_host_count" - value = datadog_monitor.appgateway_healthy_host_count.*.id +output "appgateway_healthy_host_ratio_id" { + description = "id for monitor appgateway_healthy_host_ratio" + value = datadog_monitor.appgateway_healthy_host_ratio.*.id } output "appgateway_http_4xx_errors_id" {