From 115b8e0bf3c9556acf690c5a0ee5600caa74d95c Mon Sep 17 00:00:00 2001 From: "gauthier.ampe@fr.clara.net" Date: Fri, 20 Sep 2019 16:44:12 +0200 Subject: [PATCH] MON-366 Add app-gateway monitors --- README.md | 1 + cloud/azure/app-gateway/README.md | 87 ++++++ cloud/azure/app-gateway/inputs.tf | 255 ++++++++++++++++++ cloud/azure/app-gateway/modules.tf | 31 +++ .../app-gateway/monitors-app_services.tf | 190 +++++++++++++ cloud/azure/app-gateway/outputs.tf | 30 +++ 6 files changed, 594 insertions(+) create mode 100644 cloud/azure/app-gateway/README.md create mode 100644 cloud/azure/app-gateway/inputs.tf create mode 100644 cloud/azure/app-gateway/modules.tf create mode 100644 cloud/azure/app-gateway/monitors-app_services.tf create mode 100644 cloud/azure/app-gateway/outputs.tf diff --git a/README.md b/README.md index 964c4a2..e7bfce6 100644 --- a/README.md +++ b/README.md @@ -161,6 +161,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi - [vpn](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/vpn/) - [azure](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/azure/) - [apimanagement](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/azure/apimanagement/) + - [app-gateway](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/azure/app-gateway/) - [app-services](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/azure/app-services/) - [azure-search](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/azure/azure-search/) - [cosmosdb](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/azure/cosmosdb/) diff --git a/cloud/azure/app-gateway/README.md b/cloud/azure/app-gateway/README.md 
new file mode 100644 index 0000000..2333b2d --- /dev/null +++ b/cloud/azure/app-gateway/README.md @@ -0,0 +1,87 @@ +# CLOUD AZURE APP-GATEWAY DataDog monitors + +## How to use this module + +``` +module "datadog-monitors-cloud-azure-app-gateway" { + source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/azure/app-gateway?ref={revision}" + + environment = var.environment + message = module.datadog-message-alerting.alerting-message +} + +``` + +## Purpose + +Creates DataDog monitors with the following checks: + +- App Gateway failed requests (disabled by default) +- App Gateway HTTP 4xx errors too high +- App Gateway HTTP 5xx errors too high +- App Gateway is down +- App Gateway no connection (disabled by default) +- App Gateway no healthy host + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| appgateway\_failed\_requests\_enabled | Flag to enable App Gateway failed requests monitor | string | `"false"` | no | +| appgateway\_failed\_requests\_extra\_tags | Extra tags for App Gateway failed requests monitor | list(string) | `[]` | no | +| appgateway\_failed\_requests\_message | Custom message for App Gateway failed requests monitor | string | `""` | no | +| appgateway\_failed\_requests\_threshold\_critical | Maximum critical acceptable percent of failed errors | string | `"100"` | no | +| appgateway\_failed\_requests\_threshold\_warning | Warning regarding acceptable percent of failed errors | string | `"200"` | no | +| appgateway\_failed\_requests\_time\_aggregator | Monitor aggregator for App Gateway failed requests [available values: min, max or avg] | string | `"min"` | no | +| appgateway\_failed\_requests\_timeframe | Monitor timeframe for App Gateway failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| appgateway\_healthy\_host\_count\_enabled | Flag to 
enable App Gateway healthy host monitor | string | `"true"` | no | +| appgateway\_healthy\_host\_count\_extra\_tags | Extra tags for App Gateway healthy host monitor | list(string) | `[]` | no | +| appgateway\_healthy\_host\_count\_message | Custom message for App Gateway healthy host monitor | string | `""` | no | +| appgateway\_healthy\_host\_count\_time\_aggregator | Monitor aggregator for App Gateway healthy host [available values: min, max or avg] | string | `"min"` | no | +| appgateway\_healthy\_host\_count\_timeframe | Monitor timeframe for App Gateway healthy host [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| appgateway\_http\_4xx\_errors\_enabled | Flag to enable App Gateway http 4xx errors monitor | string | `"true"` | no | +| appgateway\_http\_4xx\_errors\_extra\_tags | Extra tags for App Gateway http 4xx errors monitor | list(string) | `[]` | no | +| appgateway\_http\_4xx\_errors\_message | Custom message for App Gateway http 4xx errors monitor | string | `""` | no | +| appgateway\_http\_4xx\_errors\_threshold\_critical | Minimum critical acceptable percent of 4xx error | string | `"30"` | no | +| appgateway\_http\_4xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 4xx error | string | `"20"` | no | +| appgateway\_http\_4xx\_errors\_time\_aggregator | Monitor aggregator for App Gateway http 4xx errors [available values: min, max or avg] | string | `"max"` | no | +| appgateway\_http\_4xx\_errors\_timeframe | Monitor timeframe for App Gateway http 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| appgateway\_http\_5xx\_errors\_enabled | Flag to enable App Gateway http 5xx errors monitor | string | `"true"` | no | +| appgateway\_http\_5xx\_errors\_extra\_tags | Extra tags for App Gateway http 5xx errors monitor | list(string) | `[]` | no | +| 
appgateway\_http\_5xx\_errors\_message | Custom message for App Gateway http 5xx errors monitor | string | `""` | no | +| appgateway\_http\_5xx\_errors\_threshold\_critical | Minimum critical acceptable percent of 5xx error | string | `"30"` | no | +| appgateway\_http\_5xx\_errors\_threshold\_warning | Warning regarding acceptable percent of 5xx error | string | `"20"` | no | +| appgateway\_http\_5xx\_errors\_time\_aggregator | Monitor aggregator for App Gateway http 5xx errors [available values: min, max or avg] | string | `"max"` | no | +| appgateway\_http\_5xx\_errors\_timeframe | Monitor timeframe for App Gateway http 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| current\_connection\_enabled | Flag to enable App Gateway current connections monitor | string | `"false"` | no | +| current\_connection\_extra\_tags | Extra tags for App Gateway current connections monitor | list(string) | `[]` | no | +| current\_connection\_message | Custom message for App Gateway current connections monitor | string | `""` | no | +| current\_connection\_time\_aggregator | Monitor aggregator for App Gateway current connections [available values: min, max or avg] | string | `"min"` | no | +| current\_connection\_timeframe | Monitor timeframe for App Gateway current connections [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| environment | Architecture environment | string | n/a | yes | +| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no | +| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no | +| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | +| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | +| 
message | Message sent when a monitor is triggered | string | n/a | yes | +| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | +| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no | +| status\_enabled | Flag to enable App Gateway status | string | `"true"` | no | +| status\_extra\_tags | Extra tags for App Gateway status | list(string) | `[]` | no | +| status\_message | Custom message for App Gateway status | string | `""` | no | +| status\_time\_aggregator | Monitor aggregator for App Gateway status [available values: min, max or avg] | string | `"min"` | no | +| status\_timeframe | Monitor timeframe for App Gateway status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| appgateway\_failed\_requests\_id | id for monitor appgateway_failed_requests | +| appgateway\_healthy\_host\_count\_id | id for monitor appgateway_healthy_host_count | +| appgateway\_http\_4xx\_errors\_id | id for monitor appgateway_http_4xx_errors | +| appgateway\_http\_5xx\_errors\_id | id for monitor appgateway_http_5xx_errors | +| appgateway\_status\_id | id for monitor appgateway_status | +| current\_connection\_id | id for monitor current_connection | + +## Related documentation + diff --git a/cloud/azure/app-gateway/inputs.tf b/cloud/azure/app-gateway/inputs.tf new file mode 100644 index 0000000..8cdb5a4 --- /dev/null +++ b/cloud/azure/app-gateway/inputs.tf @@ -0,0 +1,255 @@ +variable "environment" { + description = "Architecture environment" + type = string +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "filter_tags_custom_excluded" { + 
description = "Tags excluded for custom filtering when filter_tags_use_defaults is false" + default = "" +} + +variable "message" { + description = "Message sent when a monitor is triggered" +} + +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 900 +} + +variable "new_host_delay" { + description = "Delay in seconds before monitor new resource" + default = 300 +} + +variable "prefix_slug" { + description = "Prefix string to prepend between brackets on every monitors names" + default = "" +} + +# Azure App Gateway specific variables +# Monitoring App Gateway status +variable "status_enabled" { + description = "Flag to enable App Gateway status" + type = string + default = "true" +} + +variable "status_extra_tags" { + description = "Extra tags for App Gateway status" + type = list(string) + default = [] +} + +variable "status_message" { + description = "Custom message for App Gateway status" + type = string + default = "" +} + +variable "status_time_aggregator" { + description = "Monitor aggregator for App Gateway status [available values: min, max or avg]" + type = string + default = "min" +} + +variable "status_timeframe" { + description = "Monitor timeframe for App Gateway status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +# Monitoring App Gateway current_connections (count) +variable "current_connection_enabled" { + description = "Flag to enable App Gateway current connections monitor" + type = string + default = "false" +} + +variable "current_connection_extra_tags" { + description = "Extra tags for App Gateway current connections monitor" + type = list(string) + default = [] +} + +variable "current_connection_message" { + description = "Custom message for App Gateway current connections monitor" + type = string + default = "" +} + +variable "current_connection_time_aggregator" { + description = "Monitor aggregator for 
App Gateway current connections [available values: min, max or avg]" + type = string + default = "min" +} + +variable "current_connection_timeframe" { + description = "Monitor timeframe for App Gateway current connections [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +# Monitoring App Gateway failed_requests (count) +variable "appgateway_failed_requests_enabled" { + description = "Flag to enable App Gateway failed requests monitor" + type = string + default = "false" +} + +variable "appgateway_failed_requests_extra_tags" { + description = "Extra tags for App Gateway failed requests monitor" + type = list(string) + default = [] +} + +variable "appgateway_failed_requests_message" { + description = "Custom message for App Gateway failed requests monitor" + type = string + default = "" +} + +variable "appgateway_failed_requests_time_aggregator" { + description = "Monitor aggregator for App Gateway failed requests [available values: min, max or avg]" + type = string + default = "min" +} + +variable "appgateway_failed_requests_timeframe" { + description = "Monitor timeframe for App Gateway failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +variable "appgateway_failed_requests_threshold_critical" { + default = 95 # percent; kept below 100 so the monitor's "> critical" comparison is reachable + description = "Maximum critical acceptable percent of failed errors" +} + +variable "appgateway_failed_requests_threshold_warning" { + default = 80 # must be lower than the critical threshold for a ">" alert + description = "Warning regarding acceptable percent of failed errors" +} + +# Monitoring App Gateway healthy_host_count (count) +variable "appgateway_healthy_host_count_enabled" { + description = "Flag to enable App Gateway healthy host monitor" + type = string + default = "true" +} + +variable "appgateway_healthy_host_count_extra_tags" { + description = "Extra tags for App Gateway healthy host monitor" + type = list(string) + 
default = [] +} + +variable "appgateway_healthy_host_count_message" { + description = "Custom message for App Gateway healthy host monitor" + type = string + default = "" +} + +variable "appgateway_healthy_host_count_time_aggregator" { + description = "Monitor aggregator for App Gateway healthy host [available values: min, max or avg]" + type = string + default = "min" +} + +variable "appgateway_healthy_host_count_timeframe" { + description = "Monitor timeframe for App Gateway healthy host [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +# Monitoring App Gateway response_status 4xx (count) +variable "appgateway_http_4xx_errors_enabled" { + description = "Flag to enable App Gateway http 4xx errors monitor" + type = string + default = "true" +} + +variable "appgateway_http_4xx_errors_extra_tags" { + description = "Extra tags for App Gateway http 4xx errors monitor" + type = list(string) + default = [] +} + +variable "appgateway_http_4xx_errors_message" { + description = "Custom message for App Gateway http 4xx errors monitor" + type = string + default = "" +} + +variable "appgateway_http_4xx_errors_time_aggregator" { + description = "Monitor aggregator for App Gateway http 4xx errors [available values: min, max or avg]" + type = string + default = "max" +} + +variable "appgateway_http_4xx_errors_timeframe" { + description = "Monitor timeframe for App Gateway http 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +variable "appgateway_http_4xx_errors_threshold_critical" { + default = 30 + description = "Minimum critical acceptable percent of 4xx error" +} + +variable "appgateway_http_4xx_errors_threshold_warning" { + default = 20 + description = "Warning regarding acceptable percent of 4xx error" +} + +# Monitoring App Gateway response_status 5xx (count) +variable 
"appgateway_http_5xx_errors_enabled" { + description = "Flag to enable App Gateway http 5xx errors monitor" + type = string + default = "true" +} + +variable "appgateway_http_5xx_errors_extra_tags" { + description = "Extra tags for App Gateway http 5xx errors monitor" + type = list(string) + default = [] +} + +variable "appgateway_http_5xx_errors_message" { + description = "Custom message for App Gateway http 5xx errors monitor" + type = string + default = "" +} + +variable "appgateway_http_5xx_errors_time_aggregator" { + description = "Monitor aggregator for App Gateway http 5xx errors [available values: min, max or avg]" + type = string + default = "max" +} + +variable "appgateway_http_5xx_errors_timeframe" { + description = "Monitor timeframe for App Gateway http 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +variable "appgateway_http_5xx_errors_threshold_critical" { + default = 30 + description = "Minimum critical acceptable percent of 5xx error" +} + +variable "appgateway_http_5xx_errors_threshold_warning" { + default = 20 + description = "Warning regarding acceptable percent of 5xx error" +} diff --git a/cloud/azure/app-gateway/modules.tf b/cloud/azure/app-gateway/modules.tf new file mode 100644 index 0000000..f2595dc --- /dev/null +++ b/cloud/azure/app-gateway/modules.tf @@ -0,0 +1,31 @@ +module "filter-tags" { + source = "../../../common/filter-tags" + + environment = var.environment + resource = "azure_app-gateway" + filter_tags_use_defaults = var.filter_tags_use_defaults + filter_tags_custom = var.filter_tags_custom + filter_tags_custom_excluded = var.filter_tags_custom_excluded +} + +module "filter-tags-4xx-error" { + source = "../../../common/filter-tags" + + environment = var.environment + resource = "azure_app-gateway" + filter_tags_use_defaults = var.filter_tags_use_defaults + filter_tags_custom = var.filter_tags_custom + filter_tags_custom_excluded = 
var.filter_tags_custom_excluded + extra_tags = ["httpstatus:4xx"] # the 4xx filter set must select 4xx responses, not 5xx +} + +module "filter-tags-5xx-error" { + source = "../../../common/filter-tags" + + environment = var.environment + resource = "azure_app-gateway" + filter_tags_use_defaults = var.filter_tags_use_defaults + filter_tags_custom = var.filter_tags_custom + filter_tags_custom_excluded = var.filter_tags_custom_excluded + extra_tags = ["httpstatus:5xx"] +} diff --git a/cloud/azure/app-gateway/monitors-app_services.tf b/cloud/azure/app-gateway/monitors-app_services.tf new file mode 100644 index 0000000..bdfb76b --- /dev/null +++ b/cloud/azure/app-gateway/monitors-app_services.tf @@ -0,0 +1,190 @@ +# Monitoring App Gateway status +resource "datadog_monitor" "appgateway_status" { + count = var.status_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway is down" + message = coalesce(var.status_message, var.message) + type = "query alert" + + query = < ${var.appgateway_failed_requests_threshold_critical} +EOQ + + thresholds = { + critical = var.appgateway_failed_requests_threshold_critical + warning = var.appgateway_failed_requests_threshold_warning + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + notify_no_data = false + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = false + + tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-gateway", "team:claranet", "created-by:terraform"], var.appgateway_failed_requests_extra_tags) + + lifecycle { + ignore_changes = ["silenced"] + } +} + +# Monitoring App Gateway healthy_host_count (count) +resource "datadog_monitor" "appgateway_healthy_host_count" { + count = var.appgateway_healthy_host_count_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway no healthy host {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = coalesce(var.appgateway_healthy_host_count_message, var.message) + type = "query alert" + + query = < ${var.appgateway_http_4xx_errors_threshold_critical} +EOQ + + + thresholds = { + warning = var.appgateway_http_4xx_errors_threshold_warning + critical = var.appgateway_http_4xx_errors_threshold_critical + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + notify_no_data = false + renotify_interval = 0 + require_full_window = false + timeout_h = 1 + include_tags = true + + tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-gateway", "team:claranet", "created-by:terraform"], var.appgateway_http_4xx_errors_extra_tags) + + lifecycle { + ignore_changes = ["silenced"] + } +} + +# Monitoring App Gateway response_status 5xx (count) +resource "datadog_monitor" "appgateway_http_5xx_errors" { + count = var.appgateway_http_5xx_errors_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] App Gateway HTTP 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = coalesce(var.appgateway_http_5xx_errors_message, var.message) + type = "query alert" + + query = < ${var.appgateway_http_5xx_errors_threshold_critical} +EOQ + + thresholds = { + warning = var.appgateway_http_5xx_errors_threshold_warning + critical = var.appgateway_http_5xx_errors_threshold_critical + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + notify_no_data = false + renotify_interval = 0 + require_full_window = false + timeout_h = 1 + include_tags = true + + tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-gateway", "team:claranet", "created-by:terraform"], var.appgateway_http_5xx_errors_extra_tags) + + lifecycle { + ignore_changes = ["silenced"] + } +} diff --git a/cloud/azure/app-gateway/outputs.tf b/cloud/azure/app-gateway/outputs.tf new file mode 100644 index 0000000..a50ee60 --- /dev/null +++ b/cloud/azure/app-gateway/outputs.tf @@ -0,0 +1,30 @@ +output "appgateway_failed_requests_id" { + description = "id for monitor appgateway_failed_requests" + value = datadog_monitor.appgateway_failed_requests.*.id +} + +output "appgateway_healthy_host_count_id" { + description = "id for monitor appgateway_healthy_host_count" + value = datadog_monitor.appgateway_healthy_host_count.*.id +} + +output "appgateway_http_4xx_errors_id" { + description = "id for monitor appgateway_http_4xx_errors" + value = datadog_monitor.appgateway_http_4xx_errors.*.id +} + +output "appgateway_http_5xx_errors_id" { + description = "id for monitor appgateway_http_5xx_errors" + value = datadog_monitor.appgateway_http_5xx_errors.*.id +} + +output "appgateway_status_id" { + description = "id for monitor appgateway_status" + value = datadog_monitor.appgateway_status.*.id +} 
+ +output "current_connection_id" { + description = "id for monitor current_connection" + value = datadog_monitor.current_connection.*.id +} +