From 178813dd5c9ff9751e5c2ee113a31bb888c02d21 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Sun, 26 Nov 2017 20:26:18 +0100 Subject: [PATCH 01/10] MON-74 convert all as_count queries to sum --- cloud/azure/app-services/monitors-app_services.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 1cff1af..2c2f80e 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -100,7 +100,7 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { message = "${var.message}" query = < Date: Mon, 27 Nov 2017 23:00:12 +0100 Subject: [PATCH 02/10] MON-74 add group by to all queries --- cloud/azure/app-services/monitors-app_services.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 2c2f80e..6bf3fd6 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -14,7 +14,7 @@ resource "datadog_monitor" "appservices_response_time" { query = <= ${var.response_time_threshold_critical} EOF @@ -43,7 +43,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { query = <= ${var.memory_usage_threshold_critical} EOF @@ -72,7 +72,7 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { query = < ${var.http_404_errors_count_rate_threshold_critical} EOF @@ -102,7 +102,7 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { query = < Date: Mon, 27 Nov 2017 23:29:06 +0100 Subject: [PATCH 03/10] MON-74 fix percent query --- cloud/azure/app-services/monitors-app_services.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 6bf3fd6..aedc748 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -101,8 +101,8 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { query = < Date: Thu, 7 Dec 2017 10:40:04 +0100 Subject: [PATCH 04/10] MON-74 percent for requests --- cloud/azure/app-services/README.md | 15 ++++------ cloud/azure/app-services/inputs.tf | 30 +++++++------------ .../app-services/monitors-app_services.tf | 23 +++++++------- 3 files changed, 29 insertions(+), 39 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index e56fac2..ab49366 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -19,9 +19,8 @@ Creates a DataDog monitors with the following checks : * Response time * Memory usage count -* HTTP 404 errors -* HTTP 50x errors -* HTTP 20x rate +* HTTP 404 requests +* HTTP 2xx requests Inputs ------ @@ -32,12 +31,10 @@ Inputs | environment | Architecture environment | string | - | yes | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | -| http_2xx_status_rate_limit | | string | `30` | no | -| http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | -| http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| http_404_errors_count_rate_limit | | string | `30` | no | -| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | -| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | +| http_2xx_requests_threshold_critical | Minimum critical acceptable percent of 2xx requests | string | `90` | no | +| http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | +| http_404_requests_threshold_critical | Maximum critical acceptable percent of 404 errors | string | `40` | no | +| http_404_requests_threshold_warning | Maximum warning acceptable percent of 404 errors | string | `30` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | | message | Message sent when a monitor is triggered | string | - | yes | diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index c4bc451..541a0e7 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -54,34 +54,26 @@ variable "memory_usage_threshold_warning" { ### HTTP 404 status pages ### ################################# -variable "http_404_errors_count_rate_limit" { - default = 30 +variable "http_404_requests_threshold_critical" { + default = 40 + description = "Maximum critical acceptable percent of 404 errors" } -variable "http_404_errors_count_rate_threshold_critical" { +variable "http_404_requests_threshold_warning" { default = 30 - description = "Alerting threshold (number of requests)" -} - -variable "http_404_errors_count_rate_threshold_warning" { - default = 10 - description = "Warning threshold (number of requests)" + description = "Maximum critical acceptable percent of 404 errors" } ################################# ### HTTP 202 status pages ### ################################# -variable "http_2xx_status_rate_limit" { - default = 30 +variable "http_2xx_requests_threshold_critical" { + default = 90 + description = "Minimum critical acceptable percent of 2xx requests" } -variable "http_2xx_status_rate_threshold_critical" { - default = 0.9 - description = "Alerting threshold (percentage)" -} - -variable "http_2xx_status_rate_threshold_warning" { - default = 0.95 - description = "Warning threshold (percentage)" +variable "http_2xx_requests_threshold_warning" { + default = 95 + description = "Minimum warning acceptable percent of 2xx requests" } diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index aedc748..0abc8fd 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -15,7 +15,7 @@ resource "datadog_monitor" "appservices_response_time" { query = <= ${var.response_time_threshold_critical} + ) > ${var.response_time_threshold_critical} EOF evaluation_delay = "${var.delay}" @@ -44,7 +44,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { query = <= ${var.memory_usage_threshold_critical} + ) > ${var.memory_usage_threshold_critical} EOF evaluation_delay = "${var.delay}" @@ -71,17 +71,18 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { message = "${var.message}" query = < ${var.http_404_errors_count_rate_threshold_critical} + sum(last_5m): ( + avg:azure.app_services.http404{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() / + avg:azure.app_services.requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + ) * 100 > ${var.http_404_requests_threshold_critical} EOF evaluation_delay = "${var.delay}" new_host_delay = "${var.delay}" thresholds { - warning = "${var.http_404_errors_count_rate_threshold_warning}" - critical = "${var.http_404_errors_count_rate_threshold_critical}" + warning = "${var.http_404_requests_threshold_warning}" + critical = "${var.http_404_requests_threshold_critical}" } notify_no_data = false # Will NOT notify when no data is received @@ -102,16 +103,16 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { query = < Date: Mon, 11 Dec 2017 11:41:32 +0100 Subject: [PATCH 05/10] MON-74 Fix non existent variable --- cloud/azure/app-services/monitors-app_services.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 0abc8fd..669e9db 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -66,7 +66,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { # Monitoring App Services 404 errors rate resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services HTTP errors > ${var.http_404_errors_count_rate_limit} limit on {{name}}" + name = "[${var.environment}] App Services HTTP errors > {{value}}% limit on {{name}}" type = "metric alert" message = "${var.message}" From 7de2bf4aca61736a28d143260e4d91799bca2ec6 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 13:13:09 +0100 Subject: [PATCH 06/10] MON-74 decrease thresholds for 404 errors --- cloud/azure/app-services/inputs.tf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 541a0e7..96c2892 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -55,17 +55,17 @@ variable "memory_usage_threshold_warning" { ################################# variable "http_404_requests_threshold_critical" { - default = 40 - description = "Maximum critical acceptable percent of 404 errors" -} - -variable "http_404_requests_threshold_warning" { default = 30 description = "Maximum critical acceptable percent of 404 errors" } +variable "http_404_requests_threshold_warning" { + default = 15 + description = "Maximum warning acceptable percent of 404 errors" +} + ################################# -### HTTP 202 status pages ### +### HTTP 2xx status pages ### ################################# variable "http_2xx_requests_threshold_critical" { From 6cb41b8fbb08f424f59ab13187028809d2b2a984 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 13:14:30 +0100 Subject: [PATCH 07/10] MON-74 fix response time monitor name --- cloud/azure/app-services/monitors-app_services.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 669e9db..d473fd5 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -8,7 +8,7 @@ data "template_file" "filter" { # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { - name = "[${var.environment}] App Services response time > ${var.response_time_threshold_critical}s on {{name}}" + name = "[${var.environment}] App Services response time of {{value}}s is to high on {{name}}" type = "metric alert" message = "${var.message}" From 3a56b974c106da967214dc387020623a5e427da5 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 13:28:11 +0100 Subject: [PATCH 08/10] MON-74 Add 5xx errors monitor --- cloud/azure/app-services/README.md | 3 ++ cloud/azure/app-services/inputs.tf | 14 +++++++ .../app-services/monitors-app_services.tf | 38 +++++++++++++++++-- 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index ab49366..b439492 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -19,6 +19,7 @@ Creates a DataDog monitors with the following checks : * Response time * Memory usage count +* HTTP 5xx requests * HTTP 404 requests * HTTP 2xx requests @@ -35,6 +36,8 @@ Inputs | http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | | http_404_requests_threshold_critical | Maximum critical acceptable percent of 404 errors | string | `40` | no | | http_404_requests_threshold_warning | Maximum warning acceptable percent of 404 errors | string | `30` | no | +| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `40` | no | +| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `30` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | | message | Message sent when a monitor is triggered | string | - | yes | diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 96c2892..bc50156 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -50,6 +50,20 @@ variable "memory_usage_threshold_warning" { description = "Warning threshold in MiB" } +################################# +### HTTP 5xx status pages ### +################################# + +variable "http_5xx_requests_threshold_critical" { + default = 20 + description = "Maximum critical acceptable percent of 5xx errors" +} + +variable "http_5xx_requests_threshold_warning" { + default = 10 + description = "Maximum warning acceptable percent of 5xx errors" +} + ################################# ### HTTP 404 status pages ### ################################# diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index d473fd5..3f8b49b 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -64,9 +64,39 @@ resource "datadog_monitor" "appservices_memory_usage_count" { tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } -# Monitoring App Services 404 errors rate +# Monitoring App Services 5xx errors percent +resource "datadog_monitor" "appservices_http_5xx_errors_count" { + name = "[${var.environment}] App Services HTTP 5xx errors is {{value}}% above the limit on {{name}}" + type = "metric alert" + message = "${var.message}" + + query = < ${var.http_5xx_requests_threshold_critical} + EOF + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + thresholds { + warning = "${var.http_5xx_requests_threshold_warning}" + critical = "${var.http_5xx_requests_threshold_critical}" + } + + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = 0 + require_full_window = true + timeout_h = 0 + include_tags = true + + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] +} + +# Monitoring App Services 404 errors percent resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services HTTP errors > {{value}}% limit on {{name}}" + name = "[${var.environment}] App Services HTTP 404 errors is {{value}}% above the limit on {{name}}" type = "metric alert" message = "${var.message}" @@ -94,9 +124,9 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } -# Monitoring App Services HTTP 2xx status pages rate +# Monitoring App Services HTTP 2xx status pages percent resource "datadog_monitor" "appservices_http_2xx_status_rate" { - name = "[${var.environment}] App Services Too much non 2xx HTTP status in response to the requests on {{name}}" + name = "[${var.environment}] App Services HTTP 2xx responses is {{value}}% below the limit on {{name}}" type = "metric alert" message = "${var.message}" From e3e3469cfbac1f36c0e8abfe5a8447145447bfbd Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 13:29:25 +0100 Subject: [PATCH 09/10] MON-74 Change 404 errors to 4xx --- cloud/azure/app-services/README.md | 6 +++--- cloud/azure/app-services/inputs.tf | 10 +++++----- cloud/azure/app-services/monitors-app_services.tf | 14 +++++++------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index b439492..fac9581 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -20,7 +20,7 @@ Creates a DataDog monitors with the following checks : * Response time * Memory usage count * HTTP 5xx requests -* HTTP 404 requests +* HTTP 4xx requests * HTTP 2xx requests Inputs @@ -34,8 +34,8 @@ Inputs | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | http_2xx_requests_threshold_critical | Minimum critical acceptable percent of 2xx requests | string | `90` | no | | http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | -| http_404_requests_threshold_critical | Maximum critical acceptable percent of 404 errors | string | `40` | no | -| http_404_requests_threshold_warning | Maximum warning acceptable percent of 404 errors | string | `30` | no | +| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `40` | no | +| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `30` | no | | http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `40` | no | | http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `30` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index bc50156..3085251 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -65,17 +65,17 @@ variable "http_5xx_requests_threshold_warning" { } ################################# -### HTTP 404 status pages ### +### HTTP 4xx status pages ### ################################# -variable "http_404_requests_threshold_critical" { +variable "http_4xx_requests_threshold_critical" { default = 30 - description = "Maximum critical acceptable percent of 404 errors" + description = "Maximum critical acceptable percent of 4xx errors" } -variable "http_404_requests_threshold_warning" { +variable "http_4xx_requests_threshold_warning" { default = 15 - description = "Maximum warning acceptable percent of 404 errors" + description = "Maximum warning acceptable percent of 4xx errors" } ################################# diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 3f8b49b..02cf2d9 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -94,25 +94,25 @@ resource "datadog_monitor" "appservices_http_5xx_errors_count" { tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } -# Monitoring App Services 404 errors percent -resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services HTTP 404 errors is {{value}}% above the limit on {{name}}" +# Monitoring App Services 4xx errors percent +resource "datadog_monitor" "appservices_http_4xx_errors_count" { + name = "[${var.environment}] App Services HTTP 4xx errors is {{value}}% above the limit on {{name}}" type = "metric alert" message = "${var.message}" query = < ${var.http_404_requests_threshold_critical} + ) * 100 > ${var.http_4xx_requests_threshold_critical} EOF evaluation_delay = "${var.delay}" new_host_delay = "${var.delay}" thresholds { - warning = "${var.http_404_requests_threshold_warning}" - critical = "${var.http_404_requests_threshold_critical}" + warning = "${var.http_4xx_requests_threshold_warning}" + critical = "${var.http_4xx_requests_threshold_critical}" } notify_no_data = false # Will NOT notify when no data is received From b2d807fa46495572e8981ae749fa6ca7ad854826 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 15:51:55 +0100 Subject: [PATCH 10/10] MON-74 update readme with new thresholds --- cloud/azure/app-services/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index fac9581..dc9e526 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -34,10 +34,10 @@ Inputs | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | http_2xx_requests_threshold_critical | Minimum critical acceptable percent of 2xx requests | string | `90` | no | | http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | -| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `40` | no | -| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `30` | no | -| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `40` | no | -| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `30` | no | +| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no | +| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no | +| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no | +| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | | message | Message sent when a monitor is triggered | string | - | yes |