MON-74: Response time, memory usage, http 404 status code and non 2xx http response status code percentage monitoring.
This commit is contained in:
parent
6c5bdaa042
commit
81df985f32
@ -1,7 +1,13 @@
|
|||||||
variable "filter_tags" {
|
variable "environment" {}
|
||||||
|
|
||||||
|
variable "use_filter_tags" {
|
||||||
default = "*"
|
default = "*"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "critical_escalation_group" {
|
||||||
|
default = "HO_Dummy"
|
||||||
|
}
|
||||||
|
|
||||||
###################################
|
###################################
|
||||||
### RESPONSE TIME VARIABLES ###
|
### RESPONSE TIME VARIABLES ###
|
||||||
###################################
|
###################################
|
||||||
@ -9,7 +15,7 @@ variable "response_time_appserv_eval_delay" {
|
|||||||
default = 600
|
default = 600
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "response_time_critical_threshold" {
|
variable "response_time_threshold_critical" {
|
||||||
default = 0.8
|
default = 0.8
|
||||||
description = "Alerting threshold in seconds"
|
description = "Alerting threshold in seconds"
|
||||||
}
|
}
|
||||||
@ -24,6 +30,11 @@ variable "response_time_last_time_window_code" {
|
|||||||
description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
|
description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "response_time_require_full_window" {
|
||||||
|
default = false
|
||||||
|
description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
|
||||||
|
}
|
||||||
|
|
||||||
variable "response_time_tags" {
|
variable "response_time_tags" {
|
||||||
default = []
|
default = []
|
||||||
description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
|
description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
|
||||||
@ -39,10 +50,10 @@ variable "response_time_include_tags" {
|
|||||||
description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
|
description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "response_time_notify_no_data" {
|
# variable "response_time_notify_no_data" {
|
||||||
default = true
|
# default = true
|
||||||
description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
|
# description = "A boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
|
||||||
}
|
# }
|
||||||
|
|
||||||
variable "response_time_renotify_interval" {
|
variable "response_time_renotify_interval" {
|
||||||
default = 0
|
default = 0
|
||||||
@ -76,6 +87,11 @@ variable "memory_usage_last_time_window_code" {
|
|||||||
description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
|
description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "memory_usage_require_full_window" {
|
||||||
|
default = false
|
||||||
|
description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
|
||||||
|
}
|
||||||
|
|
||||||
variable "memory_usage_tags" {
|
variable "memory_usage_tags" {
|
||||||
default = []
|
default = []
|
||||||
description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
|
description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
|
||||||
@ -91,10 +107,10 @@ variable "memory_usage_include_tags" {
|
|||||||
description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
|
description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "memory_usage_notify_no_data" {
|
# variable "memory_usage_notify_no_data" {
|
||||||
default = true
|
# default = true
|
||||||
description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
|
# description = "A boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
|
||||||
}
|
# }
|
||||||
|
|
||||||
variable "memory_usage_renotify_interval" {
|
variable "memory_usage_renotify_interval" {
|
||||||
default = 0
|
default = 0
|
||||||
@ -105,3 +121,125 @@ variable "memory_usage_escalation_message" {
|
|||||||
default = "Escalation message @pagerduty"
|
default = "Escalation message @pagerduty"
|
||||||
description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
|
description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#################################
|
||||||
|
### HTTP 404 status pages ###
|
||||||
|
#################################
|
||||||
|
variable "http_404_errors_count_rate_limit" {
|
||||||
|
default = 30
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_404_errors_count_rate_appserv_eval_delay" {
|
||||||
|
default = 600
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_404_errors_count_rate_threshold_critical" {
|
||||||
|
default = 30
|
||||||
|
description = "Alerting threshold (number of requests)"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_404_errors_count_rate_threshold_warning" {
|
||||||
|
default = 10
|
||||||
|
description = "Warning threshold (number of requests)"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_404_errors_count_rate_last_time_window_code" {
|
||||||
|
default = "5m"
|
||||||
|
description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_404_errors_count_rate_require_full_window" {
|
||||||
|
default = true
|
||||||
|
description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_404_errors_count_rate_tags" {
|
||||||
|
default = []
|
||||||
|
description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_404_errors_count_rate_timeout_h" {
|
||||||
|
default = false
|
||||||
|
description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false."
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_404_errors_count_rate_include_tags" {
|
||||||
|
default = false
|
||||||
|
description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
|
||||||
|
}
|
||||||
|
|
||||||
|
# variable "http_404_errors_count_rate_notify_no_data" {
|
||||||
|
# default = true
|
||||||
|
# description = "A boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
|
||||||
|
# }
|
||||||
|
|
||||||
|
variable "http_404_errors_count_rate_renotify_interval" {
|
||||||
|
default = 0
|
||||||
|
description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved."
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_404_errors_count_rate_escalation_message" {
|
||||||
|
default = "Escalation message @pagerduty"
|
||||||
|
description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
|
||||||
|
}
|
||||||
|
|
||||||
|
#################################
|
||||||
|
### HTTP 2xx status pages ###
|
||||||
|
#################################
|
||||||
|
variable "http_2xx_status_rate_limit" {
|
||||||
|
default = 30
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_2xx_status_rate_appserv_eval_delay" {
|
||||||
|
default = 600
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_2xx_status_rate_threshold_critical" {
|
||||||
|
default = 0.9
|
||||||
|
description = "Alerting threshold (percentage)"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_2xx_status_rate_threshold_warning" {
|
||||||
|
default = 0.95
|
||||||
|
description = "Warning threshold (percentage)"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_2xx_status_rate_last_time_window_code" {
|
||||||
|
default = "5m"
|
||||||
|
description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_2xx_status_rate_require_full_window" {
|
||||||
|
default = true
|
||||||
|
description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise."
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_2xx_status_rate_tags" {
|
||||||
|
default = []
|
||||||
|
description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_2xx_status_rate_timeout_h" {
|
||||||
|
default = false
|
||||||
|
description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false."
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_2xx_status_rate_include_tags" {
|
||||||
|
default = false
|
||||||
|
description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true."
|
||||||
|
}
|
||||||
|
|
||||||
|
# variable "http_2xx_status_rate_notify_no_data" {
|
||||||
|
# default = true
|
||||||
|
# description = "A boolean indicating whether this monitor will notify when data stops reporting. Defaults to true."
|
||||||
|
# }
|
||||||
|
|
||||||
|
variable "http_2xx_status_rate_renotify_interval" {
|
||||||
|
default = 0
|
||||||
|
description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved."
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "http_2xx_status_rate_escalation_message" {
|
||||||
|
default = "Escalation message @pagerduty"
|
||||||
|
description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere."
|
||||||
|
}
|
||||||
|
|||||||
@ -1,30 +1,31 @@
|
|||||||
# Monitoring App Services response time
|
# Monitoring App Services response time
|
||||||
resource "datadog_monitor" "appservices_reponse_time" {
|
resource "datadog_monitor" "appservices_response_time" {
|
||||||
name = "[${var.environment}] App Services response time {{value}}s is above ${var.reponse_time_threshold_critical}s"
|
name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s"
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
|
message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
|
||||||
escalation_message = "${var.reponse_time_escalation_message}"
|
escalation_message = "${var.response_time_escalation_message}"
|
||||||
|
|
||||||
query = "avg(last_${var.reponse_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.reponse_time_threshold_critical}"
|
query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.response_time_threshold_critical}"
|
||||||
|
|
||||||
evaluation_delay = "${var.reponse_time_appserv_eval_delay}"
|
evaluation_delay = "${var.response_time_appserv_eval_delay}"
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.reponse_time_threshold_warning}"
|
warning = "${var.response_time_threshold_warning}"
|
||||||
critical = "${var.reponse_time_threshold_critical}"
|
critical = "${var.response_time_threshold_critical}"
|
||||||
}
|
}
|
||||||
|
|
||||||
notify_no_data = "${var.reponse_time_notify_no_data}"
|
notify_no_data = true # Will notify when no data is received
|
||||||
renotify_interval = "${var.reponse_time_renotify_interval}"
|
renotify_interval = "${var.response_time_renotify_interval}"
|
||||||
|
require_full_window = "${var.response_time_require_full_window}"
|
||||||
|
|
||||||
timeout_h = "${var.reponse_time_timeout_h}"
|
timeout_h = "${var.response_time_timeout_h}"
|
||||||
include_tags = "${var.reponse_time_include_tags}"
|
include_tags = "${var.response_time_include_tags}"
|
||||||
|
|
||||||
tags = "${var.reponse_time_tags}"
|
tags = "${var.response_time_tags}"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Monitoring App Services memory usage
|
# Monitoring App Services memory usage
|
||||||
resource "datadog_monitor" "appservices_memory_usage" {
|
resource "datadog_monitor" "appservices_memory_usage_count" {
|
||||||
name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB"
|
name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB"
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
|
message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
|
||||||
@ -39,11 +40,63 @@ resource "datadog_monitor" "appservices_memory_usage" {
|
|||||||
critical = "${var.memory_usage_threshold_critical}"
|
critical = "${var.memory_usage_threshold_critical}"
|
||||||
}
|
}
|
||||||
|
|
||||||
notify_no_data = "${var.memory_usage_notify_no_data}"
|
notify_no_data = true # Will notify when no data is received
|
||||||
renotify_interval = "${var.memory_usage_renotify_interval}"
|
renotify_interval = "${var.memory_usage_renotify_interval}"
|
||||||
|
require_full_window = "${var.memory_usage_require_full_window}"
|
||||||
|
|
||||||
timeout_h = "${var.memory_usage_timeout_h}"
|
timeout_h = "${var.memory_usage_timeout_h}"
|
||||||
include_tags = "${var.memory_usage_include_tags}"
|
include_tags = "${var.memory_usage_include_tags}"
|
||||||
|
|
||||||
tags = "${var.memory_usage_tags}"
|
tags = "${var.memory_usage_tags}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Monitoring App Services 404 errors rate
# Triggers when the maximum, over the configured time window, of the per-minute
# HTTP 404 rate exceeds the critical threshold.
resource "datadog_monitor" "appservices_http_404_errors_count" {
  # The title shows the same threshold the query evaluates, so the two cannot
  # drift apart when the critical threshold is overridden.
  name               = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_threshold_critical} limit"
  type               = "query alert"
  message            = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
  escalation_message = "${var.http_404_errors_count_rate_escalation_message}"

  query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{*}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}"

  evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}"

  thresholds {
    warning  = "${var.http_404_errors_count_rate_threshold_warning}"
    critical = "${var.http_404_errors_count_rate_threshold_critical}"
  }

  notify_no_data    = false # Will NOT notify when no data is received
  renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}"

  # Driven by the declared variable (default: true), like the response time
  # and memory usage monitors, instead of a hard-coded value.
  require_full_window = "${var.http_404_errors_count_rate_require_full_window}"

  timeout_h    = "${var.http_404_errors_count_rate_timeout_h}"
  include_tags = "${var.http_404_errors_count_rate_include_tags}"

  tags = "${var.http_404_errors_count_rate_tags}"
}
|
||||||
|
|
||||||
|
# Monitoring App Services HTTP 2xx status pages rate
# Triggers when the ratio of 2xx responses to total requests, averaged over the
# configured time window, falls below the critical threshold.
resource "datadog_monitor" "appservices_http_2xx_status_rate" {
  name               = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests"
  type               = "query alert"
  message            = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}"
  escalation_message = "${var.http_2xx_status_rate_escalation_message}"

  # The denominator must be the total request count. Dividing http2xx by
  # itself always yields 1, so the "< threshold" comparison could never fire.
  query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.requests{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}"

  evaluation_delay = "${var.http_2xx_status_rate_appserv_eval_delay}"

  thresholds {
    warning  = "${var.http_2xx_status_rate_threshold_warning}"
    critical = "${var.http_2xx_status_rate_threshold_critical}"
  }

  notify_no_data    = true # Will notify when no data is received
  renotify_interval = "${var.http_2xx_status_rate_renotify_interval}"

  # Driven by the declared variable (default: true), like the response time
  # and memory usage monitors, instead of a hard-coded value.
  require_full_window = "${var.http_2xx_status_rate_require_full_window}"

  timeout_h    = "${var.http_2xx_status_rate_timeout_h}"
  include_tags = "${var.http_2xx_status_rate_include_tags}"

  tags = "${var.http_2xx_status_rate_tags}"
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user