MON-74: Response time, memory usage, http 404 status code and non 2xx http response status code percentage monitoring.

This commit is contained in:
Marc-Antoine ADELISE 2017-10-31 10:08:19 +01:00 committed by Laurent Piroelle
parent 6c5bdaa042
commit 81df985f32
2 changed files with 215 additions and 24 deletions

View File

@ -1,7 +1,13 @@
variable "filter_tags" {
variable "environment" {}
variable "use_filter_tags" {
default = "*"
}
# Notification handle interpolated into the monitor messages, both when an
# alert is raised and when it recovers.
variable "critical_escalation_group" {
  default = "HO_Dummy"
}
###################################
### RESPONSE TIME VARIABLES ###
###################################
@ -9,7 +15,7 @@ variable "response_time_appserv_eval_delay" {
default = 600
}
variable "response_time_critical_threshold" {
variable "response_time_threshold_critical" {
default = 0.8
description = "Alerting threshold in seconds"
}
@ -24,6 +30,11 @@ variable "response_time_last_time_window_code" {
description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
}
# Passed to the response time monitor as its require_full_window argument.
variable "response_time_require_full_window" {
  description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
  default     = false
}
variable "response_time_tags" {
default = []
description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
@ -39,10 +50,10 @@ variable "response_time_include_tags" {
description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
}
variable "response_time_notify_no_data" {
default = true
description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
}
# variable "response_time_notify_no_data" {
# default = true
# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
# }
variable "response_time_renotify_interval" {
default = 0
@ -76,6 +87,11 @@ variable "memory_usage_last_time_window_code" {
description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
}
# Passed to the memory usage monitor as its require_full_window argument.
variable "memory_usage_require_full_window" {
  description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
  default     = false
}
variable "memory_usage_tags" {
default = []
description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
@ -91,10 +107,10 @@ variable "memory_usage_include_tags" {
description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
}
variable "memory_usage_notify_no_data" {
default = true
description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
}
# variable "memory_usage_notify_no_data" {
# default = true
# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
# }
variable "memory_usage_renotify_interval" {
default = 0
@ -105,3 +121,125 @@ variable "memory_usage_escalation_message" {
default = "Escalation message @pagerduty"
description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
}
#################################
### HTTP 404 status pages ###
#################################
# Inputs of the App Services HTTP 404 monitor. Every variable has a default,
# so callers only need to override what differs from the defaults.

variable "http_404_errors_count_rate_limit" {
  default     = 30
  description = "Limit of HTTP 404 requests per minute; keep it equal to http_404_errors_count_rate_threshold_critical"
}

variable "http_404_errors_count_rate_appserv_eval_delay" {
  default     = 600
  description = "Value given to the monitor's evaluation_delay option (seconds)"
}

variable "http_404_errors_count_rate_threshold_critical" {
  default     = 30
  description = "Alerting threshold (number of HTTP 404 requests per minute)"
}

variable "http_404_errors_count_rate_threshold_warning" {
  default     = 10
  description = "Warning threshold (number of HTTP 404 requests per minute)"
}

variable "http_404_errors_count_rate_last_time_window_code" {
  default     = "5m"
  description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
}

variable "http_404_errors_count_rate_require_full_window" {
  default     = true
  description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
}

variable "http_404_errors_count_rate_tags" {
  default     = []
  description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
}

variable "http_404_errors_count_rate_timeout_h" {
  default     = false
  description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false."
}

# The description used to claim "Defaults to true" although the default
# declared here is false; the text now matches the declared default.
variable "http_404_errors_count_rate_include_tags" {
  default     = false
  description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to false."
}

# Disabled: the HTTP 404 monitor sets notify_no_data to a fixed value instead
# of reading it from a variable.
# variable "http_404_errors_count_rate_notify_no_data" {
#   default     = true
#   description = "A boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
# }

variable "http_404_errors_count_rate_renotify_interval" {
  default     = 0
  description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved."
}

variable "http_404_errors_count_rate_escalation_message" {
  default     = "Escalation message @pagerduty"
  description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
}
#################################
### HTTP 2xx status pages ###
#################################
# Inputs of the App Services HTTP 2xx status rate monitor. Every variable has
# a default, so callers only need to override what differs from the defaults.

# NOTE(review): not referenced by the 2xx monitor visible in this change;
# confirm whether it is still needed before relying on it.
variable "http_2xx_status_rate_limit" {
  default = 30
}

variable "http_2xx_status_rate_appserv_eval_delay" {
  default     = 600
  description = "Value given to the monitor's evaluation_delay option (seconds)"
}

# The thresholds are compared with a quotient of two counts, so they are
# ratios between 0 and 1 (0.9 means 90%), not values on a 0-100 scale.
variable "http_2xx_status_rate_threshold_critical" {
  default     = 0.9
  description = "Alerting threshold (ratio between 0 and 1, e.g. 0.9 for 90%)"
}

variable "http_2xx_status_rate_threshold_warning" {
  default     = 0.95
  description = "Warning threshold (ratio between 0 and 1, e.g. 0.95 for 95%)"
}

variable "http_2xx_status_rate_last_time_window_code" {
  default     = "5m"
  description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
}

variable "http_2xx_status_rate_require_full_window" {
  default     = true
  description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
}

variable "http_2xx_status_rate_tags" {
  default     = []
  description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
}

variable "http_2xx_status_rate_timeout_h" {
  default     = false
  description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false."
}

# The description used to claim "Defaults to true" although the default
# declared here is false; the text now matches the declared default.
variable "http_2xx_status_rate_include_tags" {
  default     = false
  description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to false."
}

# Disabled: the HTTP 2xx monitor sets notify_no_data to a fixed value instead
# of reading it from a variable.
# variable "http_2xx_status_rate_notify_no_data" {
#   default     = true
#   description = "A boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
# }

variable "http_2xx_status_rate_renotify_interval" {
  default     = 0
  description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved."
}

variable "http_2xx_status_rate_escalation_message" {
  default     = "Escalation message @pagerduty"
  description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
}

View File

@ -1,30 +1,31 @@
# Monitoring App Services response time
resource "datadog_monitor" "appservices_reponse_time" {
name = "[${var.environment}] App Services response time {{value}}s is above ${var.reponse_time_threshold_critical}s"
resource "datadog_monitor" "appservices_response_time" {
name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s"
type = "query alert"
message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
escalation_message = "${var.reponse_time_escalation_message}"
escalation_message = "${var.response_time_escalation_message}"
query = "avg(last_${var.reponse_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.reponse_time_threshold_critical}"
query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.response_time_threshold_critical}"
evaluation_delay = "${var.reponse_time_appserv_eval_delay}"
evaluation_delay = "${var.response_time_appserv_eval_delay}"
thresholds {
warning = "${var.reponse_time_threshold_warning}"
critical = "${var.reponse_time_threshold_critical}"
warning = "${var.response_time_threshold_warning}"
critical = "${var.response_time_threshold_critical}"
}
notify_no_data = "${var.reponse_time_notify_no_data}"
renotify_interval = "${var.reponse_time_renotify_interval}"
notify_no_data = true # Will notify when no data is received
renotify_interval = "${var.response_time_renotify_interval}"
require_full_window = "${var.response_time_require_full_window}"
timeout_h = "${var.reponse_time_timeout_h}"
include_tags = "${var.reponse_time_include_tags}"
timeout_h = "${var.response_time_timeout_h}"
include_tags = "${var.response_time_include_tags}"
tags = "${var.reponse_time_tags}"
tags = "${var.response_time_tags}"
}
# Monitoring App Services memory usage
resource "datadog_monitor" "appservices_memory_usage" {
resource "datadog_monitor" "appservices_memory_usage_count" {
name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB"
type = "query alert"
message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
@ -39,11 +40,63 @@ resource "datadog_monitor" "appservices_memory_usage" {
critical = "${var.memory_usage_threshold_critical}"
}
notify_no_data = "${var.memory_usage_notify_no_data}"
notify_no_data = true # Will notify when no data is received
renotify_interval = "${var.memory_usage_renotify_interval}"
require_full_window = "${var.memory_usage_require_full_window}"
timeout_h = "${var.memory_usage_timeout_h}"
include_tags = "${var.memory_usage_include_tags}"
tags = "${var.memory_usage_tags}"
}
# Monitoring App Services 404 errors rate
#
# Alerts when the highest per-minute rate of HTTP 404 responses over the
# configured time window exceeds the critical threshold. The alert and the
# recovery are both sent to the critical escalation group.
resource "datadog_monitor" "appservices_http_404_errors_count" {
  # The title reports the critical threshold, i.e. the value the query
  # actually alerts on, so the title and the trigger cannot drift apart.
  name               = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_threshold_critical} limit"
  type               = "query alert"
  message            = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
  escalation_message = "${var.http_404_errors_count_rate_escalation_message}"

  query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{*}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}"

  evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}"

  thresholds {
    warning  = "${var.http_404_errors_count_rate_threshold_warning}"
    critical = "${var.http_404_errors_count_rate_threshold_critical}"
  }

  notify_no_data    = false # Will NOT notify when no data is received
  renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}"

  # Read from the dedicated variable (default: true) instead of a hard-coded
  # literal, so the declared setting really takes effect.
  require_full_window = "${var.http_404_errors_count_rate_require_full_window}"

  timeout_h    = "${var.http_404_errors_count_rate_timeout_h}"
  include_tags = "${var.http_404_errors_count_rate_include_tags}"
  tags         = "${var.http_404_errors_count_rate_tags}"
}
# Monitoring App Services HTTP 2xx status pages rate
#
# Alerts when the share of requests answered with an HTTP 2xx status drops
# below the critical threshold (a ratio between 0 and 1). The alert and the
# recovery are both sent to the critical escalation group.
resource "datadog_monitor" "appservices_http_2xx_status_rate" {
  name               = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests"
  type               = "query alert"
  message            = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
  escalation_message = "${var.http_2xx_status_rate_escalation_message}"

  # The denominator must be the total request count. Dividing http2xx by
  # itself always yields 1, so the "< threshold" condition could never fire.
  query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.requests{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}"

  evaluation_delay = "${var.http_2xx_status_rate_appserv_eval_delay}"

  thresholds {
    warning  = "${var.http_2xx_status_rate_threshold_warning}"
    critical = "${var.http_2xx_status_rate_threshold_critical}"
  }

  notify_no_data    = true # Will notify when no data is received
  renotify_interval = "${var.http_2xx_status_rate_renotify_interval}"

  # Read from the dedicated variable (default: true) instead of a hard-coded
  # literal, so the declared setting really takes effect.
  require_full_window = "${var.http_2xx_status_rate_require_full_window}"

  timeout_h    = "${var.http_2xx_status_rate_timeout_h}"
  include_tags = "${var.http_2xx_status_rate_include_tags}"
  tags         = "${var.http_2xx_status_rate_tags}"
}