diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index e56fac2..dc9e526 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -19,9 +19,9 @@ Creates a DataDog monitors with the following checks : * Response time * Memory usage count -* HTTP 404 errors -* HTTP 50x errors -* HTTP 20x rate +* HTTP 5xx requests +* HTTP 4xx requests +* HTTP 2xx requests Inputs ------ @@ -32,12 +32,12 @@ Inputs | environment | Architecture environment | string | - | yes | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | -| http_2xx_status_rate_limit | | string | `30` | no | -| http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | -| http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| http_404_errors_count_rate_limit | | string | `30` | no | -| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | -| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | +| http_2xx_requests_threshold_critical | Minimum critical acceptable percent of 2xx requests | string | `90` | no | +| http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | +| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no | +| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no | +| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no | +| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no | | memory_usage_threshold_critical | 
Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | | message | Message sent when a monitor is triggered | string | - | yes | diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index c4bc451..3085251 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -51,37 +51,43 @@ variable "memory_usage_threshold_warning" { } ################################# -### HTTP 404 status pages ### +### HTTP 5xx status pages ### ################################# -variable "http_404_errors_count_rate_limit" { - default = 30 +variable "http_5xx_requests_threshold_critical" { + default = 20 + description = "Maximum critical acceptable percent of 5xx errors" } -variable "http_404_errors_count_rate_threshold_critical" { - default = 30 - description = "Alerting threshold (number of requests)" -} - -variable "http_404_errors_count_rate_threshold_warning" { +variable "http_5xx_requests_threshold_warning" { default = 10 - description = "Warning threshold (number of requests)" + description = "Maximum warning acceptable percent of 5xx errors" } ################################# -### HTTP 202 status pages ### +### HTTP 4xx status pages ### ################################# -variable "http_2xx_status_rate_limit" { - default = 30 +variable "http_4xx_requests_threshold_critical" { + default = 30 + description = "Maximum critical acceptable percent of 4xx errors" } -variable "http_2xx_status_rate_threshold_critical" { - default = 0.9 - description = "Alerting threshold (percentage)" +variable "http_4xx_requests_threshold_warning" { + default = 15 + description = "Maximum warning acceptable percent of 4xx errors" } -variable "http_2xx_status_rate_threshold_warning" { - default = 0.95 - description = "Warning threshold (percentage)" +################################# +### HTTP 2xx status pages ### +################################# + 
+variable "http_2xx_requests_threshold_critical" { + default = 90 + description = "Minimum critical acceptable percent of 2xx requests" +} + +variable "http_2xx_requests_threshold_warning" { + default = 95 + description = "Minimum warning acceptable percent of 2xx requests" } diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 1cff1af..02cf2d9 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -8,14 +8,14 @@ data "template_file" "filter" { # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { - name = "[${var.environment}] App Services response time > ${var.response_time_threshold_critical}s on {{name}}" + name = "[${var.environment}] App Services response time of {{value}}s is too high on {{name}}" type = "metric alert" message = "${var.message}" query = <= ${var.response_time_threshold_critical} + avg:azure.app_services.average_response_time{${data.template_file.filter.rendered}} by {resource_group,region,name} + ) > ${var.response_time_threshold_critical} EOF evaluation_delay = "${var.delay}" @@ -43,8 +43,8 @@ resource "datadog_monitor" "appservices_memory_usage_count" { query = <= ${var.memory_usage_threshold_critical} + avg:azure.app_services.memory_working_set{${data.template_file.filter.rendered}} by {resource_group,region,name} + ) > ${var.memory_usage_threshold_critical} EOF evaluation_delay = "${var.delay}" @@ -64,24 +64,25 @@ resource "datadog_monitor" "appservices_memory_usage_count" { tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } -# Monitoring App Services 404 errors rate -resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services HTTP errors > ${var.http_404_errors_count_rate_limit} limit on {{name}}" +# Monitoring App Services 5xx errors percent +resource "datadog_monitor" 
"appservices_http_5xx_errors_count" { + name = "[${var.environment}] App Services HTTP 5xx errors is {{value}}% above the limit on {{name}}" type = "metric alert" message = "${var.message}" query = < ${var.http_404_errors_count_rate_threshold_critical} + sum(last_5m): ( + avg:azure.app_services.http5xx{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() / + avg:azure.app_services.requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + ) * 100 > ${var.http_5xx_requests_threshold_critical} EOF evaluation_delay = "${var.delay}" new_host_delay = "${var.delay}" thresholds { - warning = "${var.http_404_errors_count_rate_threshold_warning}" - critical = "${var.http_404_errors_count_rate_threshold_critical}" + warning = "${var.http_5xx_requests_threshold_warning}" + critical = "${var.http_5xx_requests_threshold_critical}" } notify_no_data = false # Will NOT notify when no data is received @@ -93,28 +94,58 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } -# Monitoring App Services HTTP 2xx status pages rate -resource "datadog_monitor" "appservices_http_2xx_status_rate" { - name = "[${var.environment}] App Services Too much non 2xx HTTP status in response to the requests on {{name}}" +# Monitoring App Services 4xx errors percent +resource "datadog_monitor" "appservices_http_4xx_errors_count" { + name = "[${var.environment}] App Services HTTP 4xx errors is {{value}}% above the limit on {{name}}" type = "metric alert" message = "${var.message}" query = < ${var.http_4xx_requests_threshold_critical} EOF evaluation_delay = "${var.delay}" new_host_delay = "${var.delay}" thresholds { - warning = "${var.http_2xx_status_rate_threshold_warning}" - critical = "${var.http_2xx_status_rate_threshold_critical}" + warning = "${var.http_4xx_requests_threshold_warning}" + critical = 
"${var.http_4xx_requests_threshold_critical}" } - notify_no_data = true # Will notify when no data is received + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = 0 + require_full_window = true + timeout_h = 0 + include_tags = true + + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] +} + +# Monitoring App Services HTTP 2xx status pages percent +resource "datadog_monitor" "appservices_http_2xx_status_rate" { + name = "[${var.environment}] App Services HTTP 2xx responses is {{value}}% below the limit on {{name}}" + type = "metric alert" + message = "${var.message}" + + query = <