NOTE(review): this patch was recovered from a whitespace-collapsed copy. Line breaks and
indentation were rebuilt from the hunk headers, so use `git apply --ignore-whitespace` if a
context line fails to match. The four-line body of the 5xx monitor's `query` heredoc was
missing from that copy and has been reconstructed: confirm the metric names, grouping tags
and evaluation window against the existing 404 monitor before merging. The README defaults
for the 5xx thresholds now match the values declared in inputs.tf (20 / 10).

diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md
index ab49366..b439492 100644
--- a/cloud/azure/app-services/README.md
+++ b/cloud/azure/app-services/README.md
@@ -19,6 +19,7 @@ Creates a DataDog monitors with the following checks :
 
 * Response time
 * Memory usage count
+* HTTP 5xx requests
 * HTTP 404 requests
 * HTTP 2xx requests
 
@@ -35,6 +36,8 @@ Inputs
 | http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no |
 | http_404_requests_threshold_critical | Maximum critical acceptable percent of 404 errors | string | `40` | no |
 | http_404_requests_threshold_warning | Maximum warning acceptable percent of 404 errors | string | `30` | no |
+| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no |
+| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no |
 | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no |
 | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no |
 | message | Message sent when a monitor is triggered | string | - | yes |
diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf
index 96c2892..bc50156 100644
--- a/cloud/azure/app-services/inputs.tf
+++ b/cloud/azure/app-services/inputs.tf
@@ -50,6 +50,20 @@ variable "memory_usage_threshold_warning" {
   description = "Warning threshold in MiB"
 }
 
+#################################
+###   HTTP 5xx status pages   ###
+#################################
+
+variable "http_5xx_requests_threshold_critical" {
+  default     = 20
+  description = "Maximum critical acceptable percent of 5xx errors"
+}
+
+variable "http_5xx_requests_threshold_warning" {
+  default     = 10
+  description = "Maximum warning acceptable percent of 5xx errors"
+}
+
 #################################
 ###   HTTP 404 status pages   ###
 #################################
diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf
index d473fd5..3f8b49b 100644
--- a/cloud/azure/app-services/monitors-app_services.tf
+++ b/cloud/azure/app-services/monitors-app_services.tf
@@ -64,9 +64,39 @@ resource "datadog_monitor" "appservices_memory_usage_count" {
   tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"]
 }
 
-# Monitoring App Services 404 errors rate
+# Monitoring App Services 5xx errors percent
+resource "datadog_monitor" "appservices_http_5xx_errors_count" {
+  name    = "[${var.environment}] App Services HTTP 5xx errors is {{value}}% above the limit on {{name}}"
+  type    = "metric alert"
+  message = "${var.message}"
+
+  query = <<EOF
+    sum(last_5m): (
+      avg:azure.app_services.http5xx{*} by {resource_group,region,name}.as_count() /
+      avg:azure.app_services.requests{*} by {resource_group,region,name}.as_count()
+    ) * 100 > ${var.http_5xx_requests_threshold_critical}
+  EOF
+
+  evaluation_delay = "${var.delay}"
+  new_host_delay   = "${var.delay}"
+
+  thresholds {
+    warning  = "${var.http_5xx_requests_threshold_warning}"
+    critical = "${var.http_5xx_requests_threshold_critical}"
+  }
+
+  notify_no_data      = false # Will NOT notify when no data is received
+  renotify_interval   = 0
+  require_full_window = true
+  timeout_h           = 0
+  include_tags        = true
+
+  tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"]
+}
+
+# Monitoring App Services 404 errors percent
 resource "datadog_monitor" "appservices_http_404_errors_count" {
-  name    = "[${var.environment}] App Services HTTP errors > {{value}}% limit on {{name}}"
+  name    = "[${var.environment}] App Services HTTP 404 errors is {{value}}% above the limit on {{name}}"
   type    = "metric alert"
   message = "${var.message}"
 
@@ -94,9 +124,9 @@ resource "datadog_monitor" "appservices_http_404_errors_count" {
   tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"]
 }
 
-# Monitoring App Services HTTP 2xx status pages rate
+# Monitoring App Services HTTP 2xx status pages percent
 resource "datadog_monitor" "appservices_http_2xx_status_rate" {
-  name    = "[${var.environment}] App Services Too much non 2xx HTTP status in response to the requests on {{name}}"
+  name    = "[${var.environment}] App Services HTTP 2xx responses is {{value}}% below the limit on {{name}}"
   type    = "metric alert"
   message = "${var.message}"
 