Merged in MON-74-azure-appservices-monitors (pull request #28)

MON-74 azure appservices monitors Approved-by: Quentin Manfroi <quentin.manfroi@yahoo.fr> Approved-by: Laurent Piroelle <laurent.piroelle@fr.clara.net> Approved-by: Alexandre Gaillet <alexandre.gaillet@fr.clara.net> Approved-by: Jérôme Respaut <shr3ps@gmail.com>
2017-12-15 15:38:02 +00:00 · 2017-12-15 15:38:02 +00:00 · 557637f530
commit 557637f530
parent 49624998a9 b2d807fa46
3 changed files with 88 additions and 51 deletions
--- a/cloud/azure/app-services/README.md
+++ b/cloud/azure/app-services/README.md
@ -19,9 +19,9 @@ Creates a DataDog monitors with the following checks :
 * Response time
 * Memory usage count
-* HTTP 404 errors
+* HTTP 5xx requests
-* HTTP 50x errors
+* HTTP 4xx requests
-* HTTP 20x rate
+* HTTP 2xx requests
 Inputs
 ------
@ -32,12 +32,12 @@ Inputs
 | environment | Architecture environment | string | - | yes |
 | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
 | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
-| http_2xx_status_rate_limit |  | string | `30` | no |
+| http_2xx_requests_threshold_critical | Minimum critical acceptable percent of 2xx requests | string | `90` | no |
-| http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no |
+| http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no |
-| http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no |
+| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no |
-| http_404_errors_count_rate_limit |  | string | `30` | no |
+| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no |
-| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no |
+| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no |
-| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no |
+| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no |
 | memory_usage_threshold_critical | Alerting threshold in MiB | string | `52430000` | no |
 | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no |
 | message | Message sent when a monitor is triggered | string | - | yes |
--- a/cloud/azure/app-services/inputs.tf
+++ b/cloud/azure/app-services/inputs.tf
@ -51,37 +51,43 @@ variable "memory_usage_threshold_warning" {
 }
 #################################
-###   HTTP 404 status pages   ###
+###   HTTP 5xx status pages   ###
 #################################
-variable "http_404_errors_count_rate_limit" {
+variable "http_5xx_requests_threshold_critical" {
-  default = 30
+  default     = 20
  description = "Maximum critical acceptable percent of 5xx errors"
 }
-variable "http_404_errors_count_rate_threshold_critical" {
+variable "http_5xx_requests_threshold_warning" {
  default     = 30
  description = "Alerting threshold (number of requests)"
 }
 variable "http_404_errors_count_rate_threshold_warning" {
  default     = 10
-  description = "Warning threshold (number of requests)"
+  description = "Maximum warning acceptable percent of 5xx errors"
 }
 #################################
-###   HTTP 202 status pages   ###
+###   HTTP 4xx status pages   ###
 #################################
-variable "http_2xx_status_rate_limit" {
+variable "http_4xx_requests_threshold_critical" {
-  default = 30
+  default     = 30
  description = "Maximum critical acceptable percent of 4xx errors"
 }
-variable "http_2xx_status_rate_threshold_critical" {
+variable "http_4xx_requests_threshold_warning" {
-  default     = 0.9
+  default     = 15
-  description = "Alerting threshold (percentage)"
+  description = "Maximum warning acceptable percent of 4xx errors"
 }
-variable "http_2xx_status_rate_threshold_warning" {
+#################################
-  default     = 0.95
+###   HTTP 2xx status pages   ###
-  description = "Warning threshold (percentage)"
+#################################
 variable "http_2xx_requests_threshold_critical" {
  default     = 90
  description = "Minimum critical acceptable percent of 2xx requests"
 }
 variable "http_2xx_requests_threshold_warning" {
  default     = 95
  description = "Minimum warning acceptable percent of 2xx requests"
 }
--- a/cloud/azure/app-services/monitors-app_services.tf
+++ b/cloud/azure/app-services/monitors-app_services.tf
@ -8,14 +8,14 @@ data "template_file" "filter" {
 # Monitoring App Services response time
 resource "datadog_monitor" "appservices_response_time" {
-  name    = "[${var.environment}] App Services response time > ${var.response_time_threshold_critical}s on {{name}}"
+  name    = "[${var.environment}] App Services response time of {{value}}s is to high on {{name}}"
  type    = "metric alert"
  message = "${var.message}"
  query = <<EOF
    avg(last_5m): (
-      avg:azure.app_services.average_response_time{${data.template_file.filter.rendered}}
+      avg:azure.app_services.average_response_time{${data.template_file.filter.rendered}} by {resource_group,region,name}
-    ) >= ${var.response_time_threshold_critical}
+    ) > ${var.response_time_threshold_critical}
  EOF
  evaluation_delay = "${var.delay}"
@ -43,8 +43,8 @@ resource "datadog_monitor" "appservices_memory_usage_count" {
  query = <<EOF
    avg(last_5m): (
-      avg:azure.app_services.memory_working_set{${data.template_file.filter.rendered}}
+      avg:azure.app_services.memory_working_set{${data.template_file.filter.rendered}} by {resource_group,region,name}
-    ) >= ${var.memory_usage_threshold_critical}
+    ) > ${var.memory_usage_threshold_critical}
  EOF
  evaluation_delay = "${var.delay}"
@ -64,24 +64,25 @@ resource "datadog_monitor" "appservices_memory_usage_count" {
  tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"]
 }
-# Monitoring App Services 404 errors rate
+# Monitoring App Services 5xx errors percent
-resource "datadog_monitor" "appservices_http_404_errors_count" {
+resource "datadog_monitor" "appservices_http_5xx_errors_count" {
-  name    = "[${var.environment}] App Services HTTP errors > ${var.http_404_errors_count_rate_limit} limit on {{name}}"
+  name    = "[${var.environment}] App Services HTTP 5xx errors is {{value}}% above the limit on {{name}}"
  type    = "metric alert"
  message = "${var.message}"
  query = <<EOF
-    max(last_5m): (
+    sum(last_5m): (
-      per_minute(avg:azure.app_services.http404{${data.template_file.filter.rendered}}.as_rate())
+      avg:azure.app_services.http5xx{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() /
-    ) > ${var.http_404_errors_count_rate_threshold_critical}
+      avg:azure.app_services.requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count()
    ) * 100 > ${var.http_5xx_requests_threshold_critical}
  EOF
  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"
  thresholds {
-    warning  = "${var.http_404_errors_count_rate_threshold_warning}"
+    warning  = "${var.http_5xx_requests_threshold_warning}"
-    critical = "${var.http_404_errors_count_rate_threshold_critical}"
+    critical = "${var.http_5xx_requests_threshold_critical}"
  }
  notify_no_data      = false # Will NOT notify when no data is received
@ -93,28 +94,58 @@ resource "datadog_monitor" "appservices_http_404_errors_count" {
  tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"]
 }
-# Monitoring App Services HTTP 2xx status pages rate
+# Monitoring App Services 4xx errors percent
-resource "datadog_monitor" "appservices_http_2xx_status_rate" {
+resource "datadog_monitor" "appservices_http_4xx_errors_count" {
-  name    = "[${var.environment}] App Services Too much non 2xx HTTP status in response to the requests on {{name}}"
+  name    = "[${var.environment}] App Services HTTP 4xx errors is {{value}}% above the limit on {{name}}"
  type    = "metric alert"
  message = "${var.message}"
  query = <<EOF
-    avg(last_5m): (
+    sum(last_5m): (
-      avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() /
+      avg:azure.app_services.http4xx{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() /
-        avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count()
+      avg:azure.app_services.requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count()
-    ) < ${var.http_2xx_status_rate_threshold_critical}
+    ) * 100 > ${var.http_4xx_requests_threshold_critical}
  EOF
  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"
  thresholds {
-    warning  = "${var.http_2xx_status_rate_threshold_warning}"
+    warning  = "${var.http_4xx_requests_threshold_warning}"
-    critical = "${var.http_2xx_status_rate_threshold_critical}"
+    critical = "${var.http_4xx_requests_threshold_critical}"
  }
-  notify_no_data      = true  # Will notify when no data is received
+  notify_no_data      = false # Will NOT notify when no data is received
  renotify_interval   = 0
  require_full_window = true
  timeout_h           = 0
  include_tags        = true
  tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"]
 }
 # Monitoring App Services HTTP 2xx status pages percent
 resource "datadog_monitor" "appservices_http_2xx_status_rate" {
  name    = "[${var.environment}] App Services HTTP 2xx responses is {{value}}% below the limit on {{name}}"
  type    = "metric alert"
  message = "${var.message}"
  query = <<EOF
    sum(last_5m): (
      avg:azure.app_services.http2xx{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() /
      avg:azure.app_services.requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count()
    ) * 100 < ${var.http_2xx_requests_threshold_critical}
  EOF
  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"
  thresholds {
    warning  = "${var.http_2xx_requests_threshold_warning}"
    critical = "${var.http_2xx_requests_threshold_critical}"
  }
  notify_no_data      = false  # Will NOT notify when no data is received
  renotify_interval   = 0
  require_full_window = true
  timeout_h           = 0