From 81df985f3297bcf3e993fef42f3a98146339bce0 Mon Sep 17 00:00:00 2001
From: Marc-Antoine ADELISE
Date: Tue, 31 Oct 2017 10:08:19 +0100
Subject: [PATCH] MON-74: Response time, memory usage, http 404 status code and non 2xx http response status code percentage monitoring.

---
 cloud/azure/app-services/inputs.tf            | 158 ++++++++++++++++--
 .../app-services/monitors-app_services.tf     |  81 +++++++--
 2 files changed, 215 insertions(+), 24 deletions(-)

diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf
index 830fcc2..666a394 100644
--- a/cloud/azure/app-services/inputs.tf
+++ b/cloud/azure/app-services/inputs.tf
@@ -1,7 +1,13 @@
-variable "filter_tags" {
+variable "environment" {}
+
+variable "use_filter_tags" {
   default = "*"
 }
 
+variable "critical_escalation_group" {
+  default = "HO_Dummy"
+}
+
 ###################################
 ###   RESPONSE TIME VARIABLES   ###
 ###################################
@@ -9,7 +15,7 @@ variable "response_time_appserv_eval_delay" {
   default = 600
 }
 
-variable "response_time_critical_threshold" {
+variable "response_time_threshold_critical" {
   default     = 0.8
   description = "Alerting threshold in seconds"
 }
@@ -24,6 +30,11 @@ variable "response_time_last_time_window_code" {
   description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
 }
 
+variable "response_time_require_full_window" {
+  default     = false
+  description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
+}
+
 variable "response_time_tags" {
   default     = []
   description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
@@ -39,10 +50,10 @@ variable "response_time_include_tags" {
   description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
 }
 
-variable "response_time_notify_no_data" {
-  default     = true
-  description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
-}
+# variable "response_time_notify_no_data" {
+#   default     = true
+#   description = "A boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
+# }
 
 variable "response_time_renotify_interval" {
   default     = 0
@@ -76,6 +87,11 @@ variable "memory_usage_last_time_window_code" {
   description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
 }
 
+variable "memory_usage_require_full_window" {
+  default     = false
+  description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
+}
+
 variable "memory_usage_tags" {
   default     = []
   description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
@@ -91,10 +107,10 @@ variable "memory_usage_include_tags" {
   description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
 }
 
-variable "memory_usage_notify_no_data" {
-  default     = true
-  description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
-}
+# variable "memory_usage_notify_no_data" {
+#   default     = true
+#   description = "A boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
+# }
 
 variable "memory_usage_renotify_interval" {
   default     = 0
@@ -105,3 +121,125 @@ variable "memory_usage_escalation_message" {
   default     = "Escalation message @pagerduty"
   description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
 }
+
+#################################
+###   HTTP 404 status pages   ###
+#################################
+variable "http_404_errors_count_rate_limit" {
+  default = 30
+}
+
+variable "http_404_errors_count_rate_appserv_eval_delay" {
+  default = 600
+}
+
+variable "http_404_errors_count_rate_threshold_critical" {
+  default     = 30
+  description = "Alerting threshold (number of requests)"
+}
+
+variable "http_404_errors_count_rate_threshold_warning" {
+  default     = 10
+  description = "Warning threshold (number of requests)"
+}
+
+variable "http_404_errors_count_rate_last_time_window_code" {
+  default     = "5m"
+  description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
+}
+
+variable "http_404_errors_count_rate_require_full_window" {
+  default     = true
+  description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
+}
+
+variable "http_404_errors_count_rate_tags" {
+  default     = []
+  description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
+}
+
+variable "http_404_errors_count_rate_timeout_h" {
+  default     = false
+  description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false."
+}
+
+variable "http_404_errors_count_rate_include_tags" {
+  default     = false
+  description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to false."
+}
+
+# variable "http_404_errors_count_rate_notify_no_data" {
+#   default     = true
+#   description = "A boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
+# }
+
+variable "http_404_errors_count_rate_renotify_interval" {
+  default     = 0
+  description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved."
+}
+
+variable "http_404_errors_count_rate_escalation_message" {
+  default     = "Escalation message @pagerduty"
+  description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
+}
+
+#################################
+###   HTTP 2xx status pages   ###
+#################################
+variable "http_2xx_status_rate_limit" {
+  default = 30
+}
+
+variable "http_2xx_status_rate_appserv_eval_delay" {
+  default = 600
+}
+
+variable "http_2xx_status_rate_threshold_critical" {
+  default     = 0.9
+  description = "Alerting threshold (ratio of 2xx responses to total requests, between 0 and 1)"
+}
+
+variable "http_2xx_status_rate_threshold_warning" {
+  default     = 0.95
+  description = "Warning threshold (ratio of 2xx responses to total requests, between 0 and 1)"
+}
+
+variable "http_2xx_status_rate_last_time_window_code" {
+  default     = "5m"
+  description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
+}
+
+variable "http_2xx_status_rate_require_full_window" {
+  default     = true
+  description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
+}
+
+variable "http_2xx_status_rate_tags" {
+  default     = []
+  description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
+}
+
+variable "http_2xx_status_rate_timeout_h" {
+  default     = false
+  description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false."
+}
+
+variable "http_2xx_status_rate_include_tags" {
+  default     = false
+  description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to false."
+}
+
+# variable "http_2xx_status_rate_notify_no_data" {
+#   default     = true
+#   description = "A boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
+# }
+
+variable "http_2xx_status_rate_renotify_interval" {
+  default     = 0
+  description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved."
+}
+
+variable "http_2xx_status_rate_escalation_message" {
+  default     = "Escalation message @pagerduty"
+  description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
+}
diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf
index 7bf1f99..892b2c4 100644
--- a/cloud/azure/app-services/monitors-app_services.tf
+++ b/cloud/azure/app-services/monitors-app_services.tf
@@ -1,30 +1,31 @@
 # Monitoring App Services response time
-resource "datadog_monitor" "appservices_reponse_time" {
-  name               = "[${var.environment}] App Services response time {{value}}s is above ${var.reponse_time_threshold_critical}s"
+resource "datadog_monitor" "appservices_response_time" {
+  name               = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s"
   type               = "query alert"
   message            = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
-  escalation_message = "${var.reponse_time_escalation_message}"
+  escalation_message = "${var.response_time_escalation_message}"
 
-  query = "avg(last_${var.reponse_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.reponse_time_threshold_critical}"
+  query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.response_time_threshold_critical}"
 
-  evaluation_delay = "${var.reponse_time_appserv_eval_delay}"
+  evaluation_delay = "${var.response_time_appserv_eval_delay}"
 
   thresholds {
-    warning  = "${var.reponse_time_threshold_warning}"
-    critical = "${var.reponse_time_threshold_critical}"
+    warning  = "${var.response_time_threshold_warning}"
+    critical = "${var.response_time_threshold_critical}"
   }
 
-  notify_no_data    = "${var.reponse_time_notify_no_data}"
-  renotify_interval = "${var.reponse_time_renotify_interval}"
+  notify_no_data      = true # Will notify when no data is received
+  renotify_interval   = "${var.response_time_renotify_interval}"
+  require_full_window = "${var.response_time_require_full_window}"
 
-  timeout_h    = "${var.reponse_time_timeout_h}"
-  include_tags = "${var.reponse_time_include_tags}"
+  timeout_h    = "${var.response_time_timeout_h}"
+  include_tags = "${var.response_time_include_tags}"
 
-  tags = "${var.reponse_time_tags}"
+  tags = "${var.response_time_tags}"
 }
 
 # Monitoring App Services memory usage
-resource "datadog_monitor" "appservices_memory_usage" {
+resource "datadog_monitor" "appservices_memory_usage_count" {
   name               = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB"
   type               = "query alert"
   message            = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
@@ -39,11 +40,63 @@ resource "datadog_monitor" "appservices_memory_usage" {
     critical = "${var.memory_usage_threshold_critical}"
   }
 
-  notify_no_data    = "${var.memory_usage_notify_no_data}"
+  notify_no_data    = true # Will notify when no data is received
   renotify_interval = "${var.memory_usage_renotify_interval}"
+  require_full_window = "${var.memory_usage_require_full_window}"
 
   timeout_h    = "${var.memory_usage_timeout_h}"
   include_tags = "${var.memory_usage_include_tags}"
 
   tags = "${var.memory_usage_tags}"
 }
+
+# Monitoring App Services 404 errors rate: alerts when the per-minute rate of HTTP 404 responses exceeds the critical threshold
+resource "datadog_monitor" "appservices_http_404_errors_count" {
+  name               = "[${var.environment}] App Services {{value}} HTTP 404 errors > ${var.http_404_errors_count_rate_threshold_critical} limit"
+  type               = "query alert"
+  message            = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
+  escalation_message = "${var.http_404_errors_count_rate_escalation_message}"
+
+  query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{*}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}"
+
+  evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}"
+
+  thresholds {
+    warning  = "${var.http_404_errors_count_rate_threshold_warning}"
+    critical = "${var.http_404_errors_count_rate_threshold_critical}"
+  }
+
+  notify_no_data      = false # Will NOT notify when no data is received
+  renotify_interval   = "${var.http_404_errors_count_rate_renotify_interval}"
+  require_full_window = "${var.http_404_errors_count_rate_require_full_window}"
+
+  timeout_h    = "${var.http_404_errors_count_rate_timeout_h}"
+  include_tags = "${var.http_404_errors_count_rate_include_tags}"
+
+  tags = "${var.http_404_errors_count_rate_tags}"
+}
+
+# Monitoring App Services HTTP 2xx status pages rate: alerts when the share of 2xx responses among all requests drops below the critical threshold
+resource "datadog_monitor" "appservices_http_2xx_status_rate" {
+  name               = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests"
+  type               = "query alert"
+  message            = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
+  escalation_message = "${var.http_2xx_status_rate_escalation_message}"
+
+  query            = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.requests{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}"
+  evaluation_delay = "${var.http_2xx_status_rate_appserv_eval_delay}"
+
+  thresholds {
+    warning  = "${var.http_2xx_status_rate_threshold_warning}"
+    critical = "${var.http_2xx_status_rate_threshold_critical}"
+  }
+
+  notify_no_data      = true # Will notify when no data is received
+  renotify_interval   = "${var.http_2xx_status_rate_renotify_interval}"
+  require_full_window = "${var.http_2xx_status_rate_require_full_window}"
+
+  timeout_h    = "${var.http_2xx_status_rate_timeout_h}"
+  include_tags = "${var.http_2xx_status_rate_include_tags}"
+
+  tags = "${var.http_2xx_status_rate_tags}"
+}