From 81df985f3297bcf3e993fef42f3a98146339bce0 Mon Sep 17 00:00:00 2001
From: Marc-Antoine ADELISE <maadelise@morea.fr>
Date: Tue, 31 Oct 2017 10:08:19 +0100
Subject: [PATCH] MON-74: Response time, memory usage, http 404 status code and
 non 2xx http response status code percentage monitoring.

---
 cloud/azure/app-services/inputs.tf            | 158 ++++++++++++++++--
 .../app-services/monitors-app_services.tf     |  81 +++++++--
 2 files changed, 215 insertions(+), 24 deletions(-)

diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf
index 830fcc2..666a394 100644
--- a/cloud/azure/app-services/inputs.tf
+++ b/cloud/azure/app-services/inputs.tf
@@ -1,7 +1,13 @@
-variable "filter_tags" {
+variable "environment" {}
+
+variable "use_filter_tags" {
   default = "*"
 }
 
+variable "critical_escalation_group" {
+  default = "HO_Dummy"
+}
+
 ###################################
 ###   RESPONSE TIME VARIABLES   ###
 ###################################
@@ -9,7 +15,7 @@ variable "response_time_appserv_eval_delay" {
   default = 600
 }
 
-variable "response_time_critical_threshold" {
+variable "response_time_threshold_critical" {
   default = 0.8
   description = "Alerting threshold in seconds"
 }
@@ -24,6 +30,11 @@ variable "response_time_last_time_window_code" {
   description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
 }
 
+variable "response_time_require_full_window" {
+  default = false
+  description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
+}
+
 variable "response_time_tags" {
   default = []
   description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
@@ -39,10 +50,10 @@ variable "response_time_include_tags" {
   description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
 }
 
-variable "response_time_notify_no_data" {
-  default = true
-  description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
-}
+# variable "response_time_notify_no_data" {
+#   default = true
+#   description = "A boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
+# }
 
 variable "response_time_renotify_interval" {
   default = 0
@@ -76,6 +87,11 @@ variable "memory_usage_last_time_window_code" {
   description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
 }
 
+variable "memory_usage_require_full_window" {
+  default = false
+  description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
+}
+
 variable "memory_usage_tags" {
   default = []
   description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
@@ -91,10 +107,10 @@ variable "memory_usage_include_tags" {
   description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
 }
 
-variable "memory_usage_notify_no_data" {
-  default = true
-  description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
-}
+# variable "memory_usage_notify_no_data" {
+#   default = true
+#   description = "A boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
+# }
 
 variable "memory_usage_renotify_interval" {
   default = 0
@@ -105,3 +121,125 @@ variable "memory_usage_escalation_message" {
   default = "Escalation message @pagerduty"
   description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
 }
+
+#################################
+###   HTTP 404 status pages   ###
+#################################
+variable "http_404_errors_count_rate_limit" {
+  default = 30
+}
+
+variable "http_404_errors_count_rate_appserv_eval_delay" {
+  default = 600
+}
+
+variable "http_404_errors_count_rate_threshold_critical" {
+  default = 30
+  description = "Alerting threshold (number of HTTP 404 responses per minute)"
+}
+
+variable "http_404_errors_count_rate_threshold_warning" {
+  default = 10
+  description = "Warning threshold (number of HTTP 404 responses per minute)"
+}
+
+variable "http_404_errors_count_rate_last_time_window_code" {
+  default = "5m"
+  description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
+}
+
+variable "http_404_errors_count_rate_require_full_window" {
+  default = true
+  description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
+}
+
+variable "http_404_errors_count_rate_tags" {
+  default = []
+  description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
+}
+
+variable "http_404_errors_count_rate_timeout_h" {
+  default = false
+  description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false."
+}
+
+variable "http_404_errors_count_rate_include_tags" {
+  default = false
+  description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
+}
+
+# variable "http_404_errors_count_rate_notify_no_data" {
+#   default = true
+#   description = "A boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
+# }
+
+variable "http_404_errors_count_rate_renotify_interval" {
+  default = 0
+  description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved."
+}
+
+variable "http_404_errors_count_rate_escalation_message" {
+  default = "Escalation message @pagerduty"
+  description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
+}
+
+#################################
+###   HTTP 2xx status pages   ###
+#################################
+variable "http_2xx_status_rate_limit" {
+  default = 30
+}
+
+variable "http_2xx_status_rate_appserv_eval_delay" {
+  default = 600
+}
+
+variable "http_2xx_status_rate_threshold_critical" {
+  default = 0.9
+  description = "Alerting threshold: minimum ratio of HTTP 2xx responses, expressed as a fraction between 0 and 1 (e.g. 0.9 = 90%), not as a 0-100 percentage"
+}
+
+variable "http_2xx_status_rate_threshold_warning" {
+  default = 0.95
+  description = "Warning threshold: minimum ratio of HTTP 2xx responses, expressed as a fraction between 0 and 1 (e.g. 0.95 = 95%), not as a 0-100 percentage"
+}
+
+variable "http_2xx_status_rate_last_time_window_code" {
+  default = "5m"
+  description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
+}
+
+variable "http_2xx_status_rate_require_full_window" {
+  default = true
+  description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
+}
+
+variable "http_2xx_status_rate_tags" {
+  default = []
+  description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
+}
+
+variable "http_2xx_status_rate_timeout_h" {
+  default = false
+  description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false."
+}
+
+variable "http_2xx_status_rate_include_tags" {
+  default = false
+  description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
+}
+
+# variable "http_2xx_status_rate_notify_no_data" {
+#   default = true
+#   description = "A boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
+# }
+
+variable "http_2xx_status_rate_renotify_interval" {
+  default = 0
+  description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved."
+}
+
+variable "http_2xx_status_rate_escalation_message" {
+  default = "Escalation message @pagerduty"
+  description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
+}
diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf
index 7bf1f99..892b2c4 100644
--- a/cloud/azure/app-services/monitors-app_services.tf
+++ b/cloud/azure/app-services/monitors-app_services.tf
@@ -1,30 +1,31 @@
 # Monitoring App Services response time
-resource "datadog_monitor" "appservices_reponse_time" {
-  name               = "[${var.environment}] App Services response time {{value}}s is above ${var.reponse_time_threshold_critical}s"
+resource "datadog_monitor" "appservices_response_time" { # Query alert on the average App Services response time
+  name               = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s"
   type               = "query alert"
   message            = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
-  escalation_message = "${var.reponse_time_escalation_message}"
+  escalation_message = "${var.response_time_escalation_message}"
 
-  query = "avg(last_${var.reponse_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.reponse_time_threshold_critical}"
+  query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.response_time_threshold_critical}" # Average over the configured window, all scopes ({*}), compared to the critical threshold
 
-  evaluation_delay = "${var.reponse_time_appserv_eval_delay}"
+  evaluation_delay = "${var.response_time_appserv_eval_delay}"
 
   thresholds {
-    warning  = "${var.reponse_time_threshold_warning}"
-    critical = "${var.reponse_time_threshold_critical}"
+    warning  = "${var.response_time_threshold_warning}"
+    critical = "${var.response_time_threshold_critical}" # Same value the query compares against
   }
 
-  notify_no_data    = "${var.reponse_time_notify_no_data}"
-  renotify_interval = "${var.reponse_time_renotify_interval}"
+  notify_no_data    = true # Will notify when no data is received
+  renotify_interval = "${var.response_time_renotify_interval}"
+  require_full_window = "${var.response_time_require_full_window}" # When true, the monitor waits for a full window of data before evaluating
 
-  timeout_h    = "${var.reponse_time_timeout_h}"
-  include_tags = "${var.reponse_time_include_tags}"
+  timeout_h    = "${var.response_time_timeout_h}"
+  include_tags = "${var.response_time_include_tags}"
 
-  tags = "${var.reponse_time_tags}"
+  tags = "${var.response_time_tags}"
 }
 
 # Monitoring App Services memory usage
-resource "datadog_monitor" "appservices_memory_usage" {
+resource "datadog_monitor" "appservices_memory_usage_count" {
   name               = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB"
   type               = "query alert"
   message            = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
@@ -39,11 +40,63 @@ resource "datadog_monitor" "appservices_memory_usage" {
     critical = "${var.memory_usage_threshold_critical}"
   }
 
-  notify_no_data    = "${var.memory_usage_notify_no_data}"
+  notify_no_data    = true # Will notify when no data is received
   renotify_interval = "${var.memory_usage_renotify_interval}"
+  require_full_window = "${var.memory_usage_require_full_window}"
 
   timeout_h    = "${var.memory_usage_timeout_h}"
   include_tags = "${var.memory_usage_include_tags}"
 
   tags = "${var.memory_usage_tags}"
 }
+
+# Monitoring App Services HTTP 404 responses: alerts when the per-minute 404 rate exceeds the critical threshold
+resource "datadog_monitor" "appservices_http_404_errors_count" {
+  name               = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_threshold_critical} limit" # Title shows the threshold the query actually alerts on
+  type               = "query alert"
+  message            = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" # Mentions the escalation group on alert and on recovery
+  escalation_message = "${var.http_404_errors_count_rate_escalation_message}"
+
+  query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{*}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" # Peak per-minute 404 rate over the window, all scopes ({*})
+
+  evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}"
+
+  thresholds {
+    warning  = "${var.http_404_errors_count_rate_threshold_warning}"
+    critical = "${var.http_404_errors_count_rate_threshold_critical}"
+  }
+
+  notify_no_data    = false # Will NOT notify when no data is received
+  renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}"
+  require_full_window = "${var.http_404_errors_count_rate_require_full_window}" # Honour the module input (default true) instead of a hard-coded value
+
+  timeout_h    = "${var.http_404_errors_count_rate_timeout_h}"
+  include_tags = "${var.http_404_errors_count_rate_include_tags}"
+
+  tags = "${var.http_404_errors_count_rate_tags}"
+}
+
+# Monitoring App Services HTTP 2xx ratio: alerts when the share of 2xx responses among all requests drops below the critical threshold
+resource "datadog_monitor" "appservices_http_2xx_status_rate" {
+  name               = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests"
+  type               = "query alert"
+  message            = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" # Mentions the escalation group on alert and on recovery
+  escalation_message = "${var.http_2xx_status_rate_escalation_message}"
+
+  query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.requests{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" # 2xx responses divided by total requests; dividing http2xx by itself always yields 1 and never alerts
+  evaluation_delay = "${var.http_2xx_status_rate_appserv_eval_delay}"
+
+  thresholds {
+    warning  = "${var.http_2xx_status_rate_threshold_warning}" # Must stay above critical because the query uses "<"
+    critical = "${var.http_2xx_status_rate_threshold_critical}"
+  }
+
+  notify_no_data    = true # Will notify when no data is received
+  renotify_interval = "${var.http_2xx_status_rate_renotify_interval}"
+  require_full_window = "${var.http_2xx_status_rate_require_full_window}" # Honour the module input (default true) instead of a hard-coded value
+
+  timeout_h    = "${var.http_2xx_status_rate_timeout_h}"
+  include_tags = "${var.http_2xx_status_rate_include_tags}"
+
+  tags = "${var.http_2xx_status_rate_tags}"
+}