From 6c5bdaa042c11f2a4217b3b3179f1001d407ea56 Mon Sep 17 00:00:00 2001 From: Marc-Antoine ADELISE Date: Mon, 30 Oct 2017 16:32:09 +0100 Subject: [PATCH 01/10] MON-74: Added first Azure App Services resources --- cloud/azure/app-services/inputs.tf | 107 ++++++++++++++++++ .../app-services/monitors-app_services.tf | 49 ++++++++ 2 files changed, 156 insertions(+) create mode 100644 cloud/azure/app-services/inputs.tf create mode 100644 cloud/azure/app-services/monitors-app_services.tf diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf new file mode 100644 index 0000000..830fcc2 --- /dev/null +++ b/cloud/azure/app-services/inputs.tf @@ -0,0 +1,107 @@ +variable "filter_tags" { + default = "*" +} + +################################### +### RESPONSE TIME VARIABLES ### +################################### +variable "response_time_appserv_eval_delay" { + default = 600 +} + +variable "response_time_critical_threshold" { + default = 0.8 + description = "Alerting threshold in seconds" +} + +variable "response_time_threshold_warning" { + default = 0.4 + description = "Warning threshold in seconds" +} + +variable "response_time_last_time_window_code" { + default = "1h" + description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" +} + +variable "response_time_tags" { + default = [] + description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" +} + +variable "response_time_timeout_h" { + default = false + description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." 
+} + +variable "response_time_include_tags" { + default = false + description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." +} + +variable "response_time_notify_no_data" { + default = true + description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +} + +variable "response_time_renotify_interval" { + default = 0 + description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." +} + +variable "response_time_escalation_message" { + default = "Escalation message @pagerduty" + description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." +} + +################################### +### MEMORY USAGE VARIABLES ### +################################### +variable "memory_usage_appserv_eval_delay" { + default = 600 +} + +variable "memory_usage_threshold_critical" { + default = 52430000 + description = "Alerting threshold in Mib" +} + +variable "memory_usage_threshold_warning" { + default = 33550000 + description = "Warning threshold in MiB" +} + +variable "memory_usage_last_time_window_code" { + default = "5m" + description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" +} + +variable "memory_usage_tags" { + default = [] + description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" +} + +variable "memory_usage_timeout_h" { + default = false + description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." 
+} + +variable "memory_usage_include_tags" { + default = false + description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." +} + +variable "memory_usage_notify_no_data" { + default = true + description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +} + +variable "memory_usage_renotify_interval" { + default = 0 + description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." +} + +variable "memory_usage_escalation_message" { + default = "Escalation message @pagerduty" + description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." +} diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf new file mode 100644 index 0000000..7bf1f99 --- /dev/null +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -0,0 +1,49 @@ +# Monitoring App Services response time +resource "datadog_monitor" "appservices_reponse_time" { + name = "[${var.environment}] App Services response time {{value}}s is above ${var.reponse_time_threshold_critical}s" + type = "query alert" + message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + escalation_message = "${var.reponse_time_escalation_message}" + + query = "avg(last_${var.reponse_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.reponse_time_threshold_critical}" + + evaluation_delay = "${var.reponse_time_appserv_eval_delay}" + + thresholds { + warning = "${var.reponse_time_threshold_warning}" + critical = "${var.reponse_time_threshold_critical}" + } + + notify_no_data = "${var.reponse_time_notify_no_data}" + renotify_interval = 
"${var.reponse_time_renotify_interval}" + + timeout_h = "${var.reponse_time_timeout_h}" + include_tags = "${var.reponse_time_include_tags}" + + tags = "${var.reponse_time_tags}" +} + +# Monitoring App Services memory usage +resource "datadog_monitor" "appservices_memory_usage" { + name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" + type = "query alert" + message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + escalation_message = "${var.memory_usage_escalation_message}" + + query = "avg(last_${var.memory_usage_last_time_window_code}):avg:azure.app_services.memory_working_set{*} >= ${var.memory_usage_threshold_critical}" + + evaluation_delay = "${var.memory_usage_appserv_eval_delay}" + + thresholds { + warning = "${var.memory_usage_threshold_warning}" + critical = "${var.memory_usage_threshold_critical}" + } + + notify_no_data = "${var.memory_usage_notify_no_data}" + renotify_interval = "${var.memory_usage_renotify_interval}" + + timeout_h = "${var.memory_usage_timeout_h}" + include_tags = "${var.memory_usage_include_tags}" + + tags = "${var.memory_usage_tags}" +} From 81df985f3297bcf3e993fef42f3a98146339bce0 Mon Sep 17 00:00:00 2001 From: Marc-Antoine ADELISE Date: Tue, 31 Oct 2017 10:08:19 +0100 Subject: [PATCH 02/10] MON-74: Response time, memory usage, http 404 status code and non 2xx http response status code percentage monitoring. 
--- cloud/azure/app-services/inputs.tf | 158 ++++++++++++++++-- .../app-services/monitors-app_services.tf | 81 +++++++-- 2 files changed, 215 insertions(+), 24 deletions(-) diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 830fcc2..666a394 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -1,7 +1,13 @@ -variable "filter_tags" { +variable "environment" {} + +variable "use_filter_tags" { default = "*" } +variable "critical_escalation_group" { + default = "HO_Dummy" +} + ################################### ### RESPONSE TIME VARIABLES ### ################################### @@ -9,7 +15,7 @@ variable "response_time_appserv_eval_delay" { default = 600 } -variable "response_time_critical_threshold" { +variable "response_time_threshold_critical" { default = 0.8 description = "Alerting threshold in seconds" } @@ -24,6 +30,11 @@ variable "response_time_last_time_window_code" { description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } +variable "response_time_require_full_window" { + default = false + description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." +} + variable "response_time_tags" { default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" @@ -39,10 +50,10 @@ variable "response_time_include_tags" { description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." 
} -variable "response_time_notify_no_data" { - default = true - description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -} +# variable "response_time_notify_no_data" { +# default = true +# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +# } variable "response_time_renotify_interval" { default = 0 @@ -76,6 +87,11 @@ variable "memory_usage_last_time_window_code" { description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } +variable "memory_usage_require_full_window" { + default = false + description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." +} + variable "memory_usage_tags" { default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" @@ -91,10 +107,10 @@ variable "memory_usage_include_tags" { description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." } -variable "memory_usage_notify_no_data" { - default = true - description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -} +# variable "memory_usage_notify_no_data" { +# default = true +# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." 
+# } variable "memory_usage_renotify_interval" { default = 0 @@ -105,3 +121,125 @@ variable "memory_usage_escalation_message" { default = "Escalation message @pagerduty" description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." } + +################################# +### HTTP 404 status pages ### +################################# +variable "http_404_errors_count_rate_limit" { + default = 30 +} + +variable "http_404_errors_count_rate_appserv_eval_delay" { + default = 600 +} + +variable "http_404_errors_count_rate_threshold_critical" { + default = 30 + description = "Alerting threshold (number of requests)" +} + +variable "http_404_errors_count_rate_threshold_warning" { + default = 10 + description = "Warning threshold (number of requests)" +} + +variable "http_404_errors_count_rate_last_time_window_code" { + default = "5m" + description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" +} + +variable "http_404_errors_count_rate_require_full_window" { + default = true + description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." +} + +variable "http_404_errors_count_rate_tags" { + default = [] + description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" +} + +variable "http_404_errors_count_rate_timeout_h" { + default = false + description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." 
+} + +variable "http_404_errors_count_rate_include_tags" { + default = false + description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." +} + +# variable "http_404_errors_count_rate_notify_no_data" { +# default = true +# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +# } + +variable "http_404_errors_count_rate_renotify_interval" { + default = 0 + description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." +} + +variable "http_404_errors_count_rate_escalation_message" { + default = "Escalation message @pagerduty" + description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." +} + +################################# +### HTTP 202 status pages ### +################################# +variable "http_2xx_status_rate_limit" { + default = 30 +} + +variable "http_2xx_status_rate_appserv_eval_delay" { + default = 600 +} + +variable "http_2xx_status_rate_threshold_critical" { + default = 0.9 + description = "Alerting threshold (percentage)" +} + +variable "http_2xx_status_rate_threshold_warning" { + default = 0.95 + description = "Warning threshold (percentage)" +} + +variable "http_2xx_status_rate_last_time_window_code" { + default = "5m" + description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" +} + +variable "http_2xx_status_rate_require_full_window" { + default = true + description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. 
False otherwise." +} + +variable "http_2xx_status_rate_tags" { + default = [] + description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" +} + +variable "http_2xx_status_rate_timeout_h" { + default = false + description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." +} + +variable "http_2xx_status_rate_include_tags" { + default = false + description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." +} + +# variable "http_2xx_status_rate_notify_no_data" { +# default = true +# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +# } + +variable "http_2xx_status_rate_renotify_interval" { + default = 0 + description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." +} + +variable "http_2xx_status_rate_escalation_message" { + default = "Escalation message @pagerduty" + description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
+} diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 7bf1f99..892b2c4 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -1,30 +1,31 @@ # Monitoring App Services response time -resource "datadog_monitor" "appservices_reponse_time" { - name = "[${var.environment}] App Services response time {{value}}s is above ${var.reponse_time_threshold_critical}s" +resource "datadog_monitor" "appservices_response_time" { + name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" type = "query alert" message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" - escalation_message = "${var.reponse_time_escalation_message}" + escalation_message = "${var.response_time_escalation_message}" - query = "avg(last_${var.reponse_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.reponse_time_threshold_critical}" + query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.response_time_threshold_critical}" - evaluation_delay = "${var.reponse_time_appserv_eval_delay}" + evaluation_delay = "${var.response_time_appserv_eval_delay}" thresholds { - warning = "${var.reponse_time_threshold_warning}" - critical = "${var.reponse_time_threshold_critical}" + warning = "${var.response_time_threshold_warning}" + critical = "${var.response_time_threshold_critical}" } - notify_no_data = "${var.reponse_time_notify_no_data}" - renotify_interval = "${var.reponse_time_renotify_interval}" + notify_no_data = true # Will notify when no data is received + renotify_interval = "${var.response_time_renotify_interval}" + require_full_window = "${var.response_time_require_full_window}" - timeout_h = "${var.reponse_time_timeout_h}" - include_tags = 
"${var.reponse_time_include_tags}" + timeout_h = "${var.response_time_timeout_h}" + include_tags = "${var.response_time_include_tags}" - tags = "${var.reponse_time_tags}" + tags = "${var.response_time_tags}" } # Monitoring App Services memory usage -resource "datadog_monitor" "appservices_memory_usage" { +resource "datadog_monitor" "appservices_memory_usage_count" { name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" type = "query alert" message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" @@ -39,11 +40,63 @@ resource "datadog_monitor" "appservices_memory_usage" { critical = "${var.memory_usage_threshold_critical}" } - notify_no_data = "${var.memory_usage_notify_no_data}" + notify_no_data = true # Will notify when no data is received renotify_interval = "${var.memory_usage_renotify_interval}" + require_full_window = "${var.memory_usage_require_full_window}" timeout_h = "${var.memory_usage_timeout_h}" include_tags = "${var.memory_usage_include_tags}" tags = "${var.memory_usage_tags}" } + +# Monitoring App Services 404 errors rate +resource "datadog_monitor" "appservices_http_404_errors_count" { + name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" + type = "query alert" + message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + escalation_message = "${var.http_404_errors_count_rate_escalation_message}" + + query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{*}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" + + evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}" + + thresholds { + warning = "${var.http_404_errors_count_rate_threshold_warning}" + critical = 
"${var.http_404_errors_count_rate_threshold_critical}" + } + + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}" + require_full_window = true + + timeout_h = "${var.http_404_errors_count_rate_timeout_h}" + include_tags = "${var.http_404_errors_count_rate_include_tags}" + + tags = "${var.http_404_errors_count_rate_tags}" +} + +# Monitoring App Services HTTP 2xx status pages rate +resource "datadog_monitor" "appservices_http_2xx_status_rate" { + name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" + type = "query alert" + message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + escalation_message = "${var.http_2xx_status_rate_escalation_message}" + + query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.http2xx{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" + evaluation_delay = "${var.http_2xx_status_rate_appserv_eval_delay}" + + thresholds { + warning = "${var.http_2xx_status_rate_threshold_warning}" + critical = "${var.http_2xx_status_rate_threshold_critical}" + } + + notify_no_data = true # Will notify when no data is received + renotify_interval = "${var.http_2xx_status_rate_renotify_interval}" + require_full_window = true + + timeout_h = "${var.http_2xx_status_rate_timeout_h}" + include_tags = "${var.http_2xx_status_rate_include_tags}" + + tags = "${var.http_2xx_status_rate_tags}" +} From 58bbe0bc7bd08c92c26719b839d185f2e682c54f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:18:52 +0100 Subject: [PATCH 03/10] MON-74: fmt --- cloud/azure/app-services/inputs.tf | 72 +++++++++---------- .../app-services/monitors-app_services.tf | 18 ++--- 2 files changed, 45 insertions(+), 45 deletions(-) 
diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 666a394..dc26017 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -16,37 +16,37 @@ variable "response_time_appserv_eval_delay" { } variable "response_time_threshold_critical" { - default = 0.8 + default = 0.8 description = "Alerting threshold in seconds" } variable "response_time_threshold_warning" { - default = 0.4 + default = 0.4 description = "Warning threshold in seconds" } variable "response_time_last_time_window_code" { - default = "1h" + default = "1h" description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } variable "response_time_require_full_window" { - default = false + default = false description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." } variable "response_time_tags" { - default = [] + default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" } variable "response_time_timeout_h" { - default = false + default = false description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } variable "response_time_include_tags" { - default = false + default = false description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." 
} @@ -56,12 +56,12 @@ variable "response_time_include_tags" { # } variable "response_time_renotify_interval" { - default = 0 + default = 0 description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." } variable "response_time_escalation_message" { - default = "Escalation message @pagerduty" + default = "Escalation message @pagerduty" description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." } @@ -73,37 +73,37 @@ variable "memory_usage_appserv_eval_delay" { } variable "memory_usage_threshold_critical" { - default = 52430000 + default = 52430000 description = "Alerting threshold in Mib" } variable "memory_usage_threshold_warning" { - default = 33550000 + default = 33550000 description = "Warning threshold in MiB" } variable "memory_usage_last_time_window_code" { - default = "5m" + default = "5m" description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } variable "memory_usage_require_full_window" { - default = false + default = false description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." } variable "memory_usage_tags" { - default = [] + default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the API" } variable "memory_usage_timeout_h" { - default = false + default = false description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } variable "memory_usage_include_tags" { - default = false + default = false description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." } @@ -113,12 +113,12 @@ variable "memory_usage_include_tags" { # } variable "memory_usage_renotify_interval" { - default = 0 + default = 0 description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." } variable "memory_usage_escalation_message" { - default = "Escalation message @pagerduty" + default = "Escalation message @pagerduty" description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." } @@ -134,37 +134,37 @@ variable "http_404_errors_count_rate_appserv_eval_delay" { } variable "http_404_errors_count_rate_threshold_critical" { - default = 30 + default = 30 description = "Alerting threshold (number of requests)" } variable "http_404_errors_count_rate_threshold_warning" { - default = 10 + default = 10 description = "Warning threshold (number of requests)" } variable "http_404_errors_count_rate_last_time_window_code" { - default = "5m" + default = "5m" description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } variable "http_404_errors_count_rate_require_full_window" { - default = true + default = true description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. 
We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." } variable "http_404_errors_count_rate_tags" { - default = [] + default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" } variable "http_404_errors_count_rate_timeout_h" { - default = false + default = false description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } variable "http_404_errors_count_rate_include_tags" { - default = false + default = false description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." } @@ -174,12 +174,12 @@ variable "http_404_errors_count_rate_include_tags" { # } variable "http_404_errors_count_rate_renotify_interval" { - default = 0 + default = 0 description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." } variable "http_404_errors_count_rate_escalation_message" { - default = "Escalation message @pagerduty" + default = "Escalation message @pagerduty" description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
} @@ -195,37 +195,37 @@ variable "http_2xx_status_rate_appserv_eval_delay" { } variable "http_2xx_status_rate_threshold_critical" { - default = 0.9 + default = 0.9 description = "Alerting threshold (percentage)" } variable "http_2xx_status_rate_threshold_warning" { - default = 0.95 + default = 0.95 description = "Warning threshold (percentage)" } variable "http_2xx_status_rate_last_time_window_code" { - default = "5m" + default = "5m" description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } variable "http_2xx_status_rate_require_full_window" { - default = true + default = true description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." } variable "http_2xx_status_rate_tags" { - default = [] + default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" } variable "http_2xx_status_rate_timeout_h" { - default = false + default = false description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } variable "http_2xx_status_rate_include_tags" { - default = false + default = false description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." 
} @@ -235,11 +235,11 @@ variable "http_2xx_status_rate_include_tags" { # } variable "http_2xx_status_rate_renotify_interval" { - default = 0 + default = 0 description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." } variable "http_2xx_status_rate_escalation_message" { - default = "Escalation message @pagerduty" + default = "Escalation message @pagerduty" description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." } diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 892b2c4..3e5f94a 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -14,8 +14,8 @@ resource "datadog_monitor" "appservices_response_time" { critical = "${var.response_time_threshold_critical}" } - notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.response_time_renotify_interval}" + notify_no_data = true # Will notify when no data is received + renotify_interval = "${var.response_time_renotify_interval}" require_full_window = "${var.response_time_require_full_window}" timeout_h = "${var.response_time_timeout_h}" @@ -40,8 +40,8 @@ resource "datadog_monitor" "appservices_memory_usage_count" { critical = "${var.memory_usage_threshold_critical}" } - notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.memory_usage_renotify_interval}" + notify_no_data = true # Will notify when no data is received + renotify_interval = "${var.memory_usage_renotify_interval}" require_full_window = "${var.memory_usage_require_full_window}" timeout_h = "${var.memory_usage_timeout_h}" @@ -66,8 +66,8 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { critical = "${var.http_404_errors_count_rate_threshold_critical}" } - notify_no_data = 
false # Will NOT notify when no data is received - renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}" + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}" require_full_window = true timeout_h = "${var.http_404_errors_count_rate_timeout_h}" @@ -83,7 +83,7 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" escalation_message = "${var.http_2xx_status_rate_escalation_message}" - query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.http2xx{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" + query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.http2xx{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" evaluation_delay = "${var.http_2xx_status_rate_appserv_eval_delay}" thresholds { @@ -91,8 +91,8 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { critical = "${var.http_2xx_status_rate_threshold_critical}" } - notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.http_2xx_status_rate_renotify_interval}" + notify_no_data = true # Will notify when no data is received + renotify_interval = "${var.http_2xx_status_rate_renotify_interval}" require_full_window = true timeout_h = "${var.http_2xx_status_rate_timeout_h}" From 4c9bc13de0ae6365d94a3a3d311a8f3339b5bd09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:22:18 +0100 Subject: [PATCH 04/10] MON-74: Use filter tags option --- cloud/azure/app-services/inputs.tf | 3 ++- .../app-services/monitors-app_services.tf | 19 ++++++++++++++----- 2 files changed, 16 insertions(+), 6 
deletions(-) diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index dc26017..8af09cb 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -1,7 +1,8 @@ variable "environment" {} variable "use_filter_tags" { - default = "*" + description = "Filter the data with service tags if true" + default = "true" } variable "critical_escalation_group" { diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 3e5f94a..48b8184 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -1,3 +1,11 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_appservices:enabled,env:%s", var.environment) : "*"}" + } +} + # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" @@ -5,7 +13,7 @@ resource "datadog_monitor" "appservices_response_time" { message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" escalation_message = "${var.response_time_escalation_message}" - query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.response_time_threshold_critical}" + query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{${data.template_file.filter.rendered}} >= ${var.response_time_threshold_critical}" evaluation_delay = "${var.response_time_appserv_eval_delay}" @@ -31,7 +39,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { message = 
"{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" escalation_message = "${var.memory_usage_escalation_message}" - query = "avg(last_${var.memory_usage_last_time_window_code}):avg:azure.app_services.memory_working_set{*} >= ${var.memory_usage_threshold_critical}" + query = "avg(last_${var.memory_usage_last_time_window_code}):avg:azure.app_services.memory_working_set{${data.template_file.filter.rendered}} >= ${var.memory_usage_threshold_critical}" evaluation_delay = "${var.memory_usage_appserv_eval_delay}" @@ -57,7 +65,7 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" escalation_message = "${var.http_404_errors_count_rate_escalation_message}" - query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{*}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" + query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{${data.template_file.filter.rendered}}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}" @@ -83,7 +91,7 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" escalation_message = "${var.http_2xx_status_rate_escalation_message}" - query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.http2xx{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" + query = 
"avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() / avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" evaluation_delay = "${var.http_2xx_status_rate_appserv_eval_delay}" thresholds { @@ -91,7 +99,8 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { critical = "${var.http_2xx_status_rate_threshold_critical}" } - notify_no_data = true # Will notify when no data is received + # Will notify when no data is received + notify_no_data = true renotify_interval = "${var.http_2xx_status_rate_renotify_interval}" require_full_window = true From ac96ee6586a2800c13dde2b25ac456c5d695d15b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:24:08 +0100 Subject: [PATCH 05/10] MON-74: Uses generic message parameter --- cloud/azure/app-services/inputs.tf | 12 ++++++++---- cloud/azure/app-services/monitors-app_services.tf | 8 ++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 8af09cb..4ad908b 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -1,19 +1,23 @@ -variable "environment" {} +variable "environment" { + description = "Architecture environment" + type = "string" +} variable "use_filter_tags" { description = "Filter the data with service tags if true" default = "true" } -variable "critical_escalation_group" { - default = "HO_Dummy" +variable "message" { + description = "Message sent when a monitor is triggered" } ################################### ### RESPONSE TIME VARIABLES ### ################################### variable "response_time_appserv_eval_delay" { - default = 600 + description = "Delay in seconds for the metric evaluation" + default = 600 } variable "response_time_threshold_critical" { diff --git 
a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 48b8184..9447cb4 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -10,7 +10,7 @@ data "template_file" "filter" { resource "datadog_monitor" "appservices_response_time" { name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" type = "query alert" - message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + message = "${var.message}" escalation_message = "${var.response_time_escalation_message}" query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{${data.template_file.filter.rendered}} >= ${var.response_time_threshold_critical}" @@ -36,7 +36,7 @@ resource "datadog_monitor" "appservices_response_time" { resource "datadog_monitor" "appservices_memory_usage_count" { name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" type = "query alert" - message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + message = "${var.message}" escalation_message = "${var.memory_usage_escalation_message}" query = "avg(last_${var.memory_usage_last_time_window_code}):avg:azure.app_services.memory_working_set{${data.template_file.filter.rendered}} >= ${var.memory_usage_threshold_critical}" @@ -62,7 +62,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { resource "datadog_monitor" "appservices_http_404_errors_count" { name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" type = "query alert" - message = 
"{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + message = "${var.message}" escalation_message = "${var.http_404_errors_count_rate_escalation_message}" query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{${data.template_file.filter.rendered}}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" @@ -88,7 +88,7 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { resource "datadog_monitor" "appservices_http_2xx_status_rate" { name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" type = "query alert" - message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + message = "${var.message}" escalation_message = "${var.http_2xx_status_rate_escalation_message}" query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() / avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" From 31e036a8055c1404cb6b74808a701652fef42c04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:27:09 +0100 Subject: [PATCH 06/10] MON-74: Readme --- cloud/azure/app-services/README.md | 83 ++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 cloud/azure/app-services/README.md diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md new file mode 100644 index 0000000..443c819 --- /dev/null +++ b/cloud/azure/app-services/README.md @@ -0,0 +1,83 @@ +Azure AppServices (Web, API, Functions) DataDog monitors +======================================================== + +How to use this module +---------------------- + +``` +module 
"datadog-monitors-azure-app-services" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/app-services?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" + client_name = "${var.client_name}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* Response time +* Memory usage count +* HTTP 404 errors +* HTTP 50x errors +* HTTP 20x rate + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| environment | Architecture environment | string | - | yes | +| http_2xx_status_rate_appserv_eval_delay | | string | `600` | no | +| http_2xx_status_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| http_2xx_status_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_2xx_status_rate_limit | ################################ ## HTTP 202 status pages ### ################################ | string | `30` | no | +| http_2xx_status_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. 
Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | +| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | +| http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | +| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| http_404_errors_count_rate_appserv_eval_delay | | string | `600` | no | +| http_404_errors_count_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| http_404_errors_count_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_404_errors_count_rate_limit | ################################ ## HTTP 404 status pages ### ################################ | string | `30` | no | +| http_404_errors_count_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. 
| string | `0` | no | +| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | +| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | +| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | +| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| memory_usage_appserv_eval_delay | ################################## ## MEMORY USAGE VARIABLES ### ################################## | string | `600` | no | +| memory_usage_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| memory_usage_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| memory_usage_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. 
It will only re-notify if it's not resolved. | string | `0` | no | +| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | +| memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | +| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| response_time_appserv_eval_delay | Delay in seconds for the metric evaluation | string | `600` | no | +| response_time_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| response_time_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | +| response_time_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. 
It will only re-notify if it's not resolved. | string | `0` | no | +| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | +| response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | +| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. 
| string | `false` | no | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | + + +Related documentation +--------------------- + +DataDog documentation: https://docs.datadoghq.com/integrations/azure_app_services From 98f5b6f331f381b9c7300f12036ac34320d0a718 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:31:55 +0100 Subject: [PATCH 07/10] MON-74: Readme update --- cloud/azure/app-services/README.md | 165 ++++++++++++++--------------- cloud/azure/app-services/inputs.tf | 4 + 2 files changed, 86 insertions(+), 83 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index 443c819..d8a02c7 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -1,83 +1,82 @@ -Azure AppServices (Web, API, Functions) DataDog monitors -======================================================== - -How to use this module ----------------------- - -``` -module "datadog-monitors-azure-app-services" { - source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/app-services?ref={revision}" - - message = "${module.datadog-message-alerting.alerting-message}" - - environment = "${var.environment}" - client_name = "${var.client_name}" -} -``` - -Purpose -------- -Creates a DataDog monitors with the following checks : - -* Response time -* Memory usage count -* HTTP 404 errors -* HTTP 50x errors -* HTTP 20x rate - -Inputs ------- - -| Name | Description | Type | Default | Required | -|------|-------------|:----:|:-----:|:-----:| -| environment | Architecture environment | string | - | yes | -| http_2xx_status_rate_appserv_eval_delay | | string | `600` | no | -| http_2xx_status_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. 
| string | `Escalation message @pagerduty` | no | -| http_2xx_status_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| http_2xx_status_rate_limit | ################################ ## HTTP 202 status pages ### ################################ | string | `30` | no | -| http_2xx_status_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | -| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | -| http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | -| http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. 
| string | `false` | no | -| http_404_errors_count_rate_appserv_eval_delay | | string | `600` | no | -| http_404_errors_count_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| http_404_errors_count_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| http_404_errors_count_rate_limit | ################################ ## HTTP 404 status pages ### ################################ | string | `30` | no | -| http_404_errors_count_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | -| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | -| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | -| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | -| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| memory_usage_appserv_eval_delay | ################################## ## MEMORY USAGE VARIABLES ### ################################## | string | `600` | no | -| memory_usage_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| memory_usage_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| memory_usage_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| memory_usage_tags | A list of tags to associate with your monitor. 
This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | -| memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | -| memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | -| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| message | Message sent when a monitor is triggered | string | - | yes | -| response_time_appserv_eval_delay | Delay in seconds for the metric evaluation | string | `600` | no | -| response_time_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| response_time_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | -| response_time_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| response_time_tags | A list of tags to associate with your monitor. 
This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | -| response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | -| response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | -| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | - - -Related documentation ---------------------- - -DataDog documentation: https://docs.datadoghq.com/integrations/azure_app_services +Azure AppServices (Web, API, Functions) DataDog monitors +======================================================== + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-app-services" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/app-services?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" + client_name = "${var.client_name}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* Response time +* Memory usage count +* HTTP 404 errors +* HTTP 50x errors +* HTTP 20x rate + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| environment | Architecture environment | string | - | yes | +| http_2xx_status_rate_appserv_eval_delay | | string | `600` | no | +| http_2xx_status_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. 
| string | `Escalation message @pagerduty` | no | +| http_2xx_status_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_2xx_status_rate_limit | | string | `30` | no | +| http_2xx_status_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | +| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | +| http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | +| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| http_404_errors_count_rate_appserv_eval_delay | | string | `600` | no | +| http_404_errors_count_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. 
| string | `Escalation message @pagerduty` | no | +| http_404_errors_count_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_404_errors_count_rate_limit | | string | `30` | no | +| http_404_errors_count_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | +| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | +| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | +| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| memory_usage_appserv_eval_delay | | string | `600` | no | +| memory_usage_escalation_message | A message to include with a re-notification. 
Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| memory_usage_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| memory_usage_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | +| memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | +| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| response_time_appserv_eval_delay | Delay in seconds for the metric evaluation | string | `600` | no | +| response_time_escalation_message | A message to include with a re-notification. 
Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| response_time_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | +| response_time_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | +| response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | +| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. 
| string | `false` | no | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_app_services](https://docs.datadoghq.com/integrations/azure_app_services) diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 4ad908b..4f2a693 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -15,6 +15,7 @@ variable "message" { ################################### ### RESPONSE TIME VARIABLES ### ################################### + variable "response_time_appserv_eval_delay" { description = "Delay in seconds for the metric evaluation" default = 600 @@ -73,6 +74,7 @@ variable "response_time_escalation_message" { ################################### ### MEMORY USAGE VARIABLES ### ################################### + variable "memory_usage_appserv_eval_delay" { default = 600 } @@ -130,6 +132,7 @@ variable "memory_usage_escalation_message" { ################################# ### HTTP 404 status pages ### ################################# + variable "http_404_errors_count_rate_limit" { default = 30 } @@ -191,6 +194,7 @@ variable "http_404_errors_count_rate_escalation_message" { ################################# ### HTTP 202 status pages ### ################################# + variable "http_2xx_status_rate_limit" { default = 30 } From dc06fb9519175c55c6d12b60ceea20a71ac4af0e Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 15:28:41 +0100 Subject: [PATCH 08/10] MON-74 Add EOF on querys --- .../app-services/monitors-app_services.tf | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 9447cb4..c42ad6c 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ 
b/cloud/azure/app-services/monitors-app_services.tf @@ -13,7 +13,11 @@ resource "datadog_monitor" "appservices_response_time" { message = "${var.message}" escalation_message = "${var.response_time_escalation_message}" - query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{${data.template_file.filter.rendered}} >= ${var.response_time_threshold_critical}" + query = <= ${var.response_time_threshold_critical} + EOF evaluation_delay = "${var.response_time_appserv_eval_delay}" @@ -39,7 +43,11 @@ resource "datadog_monitor" "appservices_memory_usage_count" { message = "${var.message}" escalation_message = "${var.memory_usage_escalation_message}" - query = "avg(last_${var.memory_usage_last_time_window_code}):avg:azure.app_services.memory_working_set{${data.template_file.filter.rendered}} >= ${var.memory_usage_threshold_critical}" + query = <= ${var.memory_usage_threshold_critical} + EOF evaluation_delay = "${var.memory_usage_appserv_eval_delay}" @@ -65,7 +73,11 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { message = "${var.message}" escalation_message = "${var.http_404_errors_count_rate_escalation_message}" - query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{${data.template_file.filter.rendered}}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" + query = < ${var.http_404_errors_count_rate_threshold_critical} + EOF evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}" @@ -91,7 +103,13 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { message = "${var.message}" escalation_message = "${var.http_2xx_status_rate_escalation_message}" - query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() / avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() < 
${var.http_2xx_status_rate_threshold_critical}" + query = < Date: Thu, 2 Nov 2017 16:54:18 +0100 Subject: [PATCH 09/10] MON-74 Fix changes to fit as the other modules --- cloud/azure/app-services/README.md | 80 +++++++++-------- cloud/azure/app-services/inputs.tf | 86 +++---------------- .../app-services/monitors-app_services.tf | 60 ++++++------- 3 files changed, 86 insertions(+), 140 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index d8a02c7..90f5882 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -28,52 +28,64 @@ Creates a DataDog monitors with the following checks : Inputs ------ -| Name | Description | Type | Default | Required | -|------|-------------|:----:|:-----:|:-----:| +| Name | Description | Type | Default | Required | DESKTOP-0PBDRFR: ~ +|------|-------------|:----:|:-----:|:-----:| → +| client_name | Client Name | string | - | yes | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| http_2xx_status_rate_appserv_eval_delay | | string | `600` | no | -| http_2xx_status_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| http_2xx_status_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... 
to write last_# +m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | | http_2xx_status_rate_limit | | string | `30` | no | -| http_2xx_status_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | -| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data bef +ore it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be s +kipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` +| no | +| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter m +onitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying + via the API | string | `` | no | | http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | | http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| http_404_errors_count_rate_appserv_eval_delay | | string | `600` | no | -| http_404_errors_count_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| http_404_errors_count_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically +resolve from a triggered state. Defaults to false. | string | `false` | no | +| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write +last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | | http_404_errors_count_rate_limit | | string | `30` | no | -| http_404_errors_count_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. 
| string | `0` | no | -| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | -| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of da +ta before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations wil +l be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | ` +true` | no | +| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and fi +lter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when qu +erying via the API | string | `` | no | | http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | | http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | -| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| memory_usage_appserv_eval_delay | | string | `600` | no | -| memory_usage_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. 
| string | `Escalation message @pagerduty` | no | -| memory_usage_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| memory_usage_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automati +cally resolve from a triggered state. Defaults to false. | string | `false` | no | +| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, + 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's + evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. +Default: True for 'on average', 'at all times' and 'in total' aggregation. 
False otherwise. | string | `false` | no | +| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors +in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the + API | string | `` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | -| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve +from a triggered state. Defaults to false. | string | `false` | no | | message | Message sent when a monitor is triggered | string | - | yes | -| response_time_appserv_eval_delay | Delay in seconds for the metric evaluation | string | `600` | no | -| response_time_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| response_time_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | -| response_time_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. 
We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5 +, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | +| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it' +s evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. + Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors + in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via th +e API | string | `` | no | | response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | | response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | -| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve + from a triggered state. Defaults to false. 
| string | `false` | no | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 4f2a693..5f0f2b0 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -3,6 +3,11 @@ variable "environment" { type = "string" } +variable "client_name" { + description = "Client Name" + type = "string" +} + variable "use_filter_tags" { description = "Filter the data with service tags if true" default = "true" @@ -12,15 +17,15 @@ variable "message" { description = "Message sent when a monitor is triggered" } -################################### -### RESPONSE TIME VARIABLES ### -################################### - -variable "response_time_appserv_eval_delay" { +variable "delay" { description = "Delay in seconds for the metric evaluation" default = 600 } +################################### +### RESPONSE TIME VARIABLES ### +################################### + variable "response_time_threshold_critical" { default = 0.8 description = "Alerting threshold in seconds" @@ -51,34 +56,15 @@ variable "response_time_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "response_time_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "response_time_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "response_time_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." 
-} - -variable "response_time_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." -} - ################################### ### MEMORY USAGE VARIABLES ### ################################### -variable "memory_usage_appserv_eval_delay" { - default = 600 -} - variable "memory_usage_threshold_critical" { default = 52430000 description = "Alerting threshold in Mib" @@ -109,26 +95,11 @@ variable "memory_usage_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "memory_usage_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "memory_usage_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "memory_usage_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." -} - -variable "memory_usage_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
-} - ################################# ### HTTP 404 status pages ### ################################# @@ -137,10 +108,6 @@ variable "http_404_errors_count_rate_limit" { default = 30 } -variable "http_404_errors_count_rate_appserv_eval_delay" { - default = 600 -} - variable "http_404_errors_count_rate_threshold_critical" { default = 30 description = "Alerting threshold (number of requests)" @@ -171,26 +138,11 @@ variable "http_404_errors_count_rate_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "http_404_errors_count_rate_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "http_404_errors_count_rate_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "http_404_errors_count_rate_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." -} - -variable "http_404_errors_count_rate_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
-} - ################################# ### HTTP 202 status pages ### ################################# @@ -199,10 +151,6 @@ variable "http_2xx_status_rate_limit" { default = 30 } -variable "http_2xx_status_rate_appserv_eval_delay" { - default = 600 -} - variable "http_2xx_status_rate_threshold_critical" { default = 0.9 description = "Alerting threshold (percentage)" @@ -233,22 +181,8 @@ variable "http_2xx_status_rate_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "http_2xx_status_rate_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "http_2xx_status_rate_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "http_2xx_status_rate_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." -} - -variable "http_2xx_status_rate_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
-} diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index c42ad6c..437b7fb 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -8,10 +8,9 @@ data "template_file" "filter" { # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { - name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.response_time_escalation_message}" + name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" + type = "query alert" + message = "${var.message}" query = <= ${var.response_time_threshold_critical} EOF - evaluation_delay = "${var.response_time_appserv_eval_delay}" + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" thresholds { warning = "${var.response_time_threshold_warning}" @@ -27,21 +27,20 @@ resource "datadog_monitor" "appservices_response_time" { } notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.response_time_renotify_interval}" + renotify_interval = 0 require_full_window = "${var.response_time_require_full_window}" timeout_h = "${var.response_time_timeout_h}" - include_tags = "${var.response_time_include_tags}" + include_tags = true tags = "${var.response_time_tags}" } # Monitoring App Services memory usage resource "datadog_monitor" "appservices_memory_usage_count" { - name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.memory_usage_escalation_message}" + name = "[${var.environment}] App Services memory usage {{value}} bytes is above 
${ceil(var.memory_usage_threshold_critical/1000000)}MiB" + type = "query alert" + message = "${var.message}" query = <= ${var.memory_usage_threshold_critical} EOF - evaluation_delay = "${var.memory_usage_appserv_eval_delay}" + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" thresholds { warning = "${var.memory_usage_threshold_warning}" @@ -57,21 +57,20 @@ resource "datadog_monitor" "appservices_memory_usage_count" { } notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.memory_usage_renotify_interval}" + renotify_interval = 0 require_full_window = "${var.memory_usage_require_full_window}" timeout_h = "${var.memory_usage_timeout_h}" - include_tags = "${var.memory_usage_include_tags}" + include_tags = true tags = "${var.memory_usage_tags}" } # Monitoring App Services 404 errors rate resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.http_404_errors_count_rate_escalation_message}" + name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" + type = "query alert" + message = "${var.message}" query = < ${var.http_404_errors_count_rate_threshold_critical} EOF - evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}" + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" thresholds { warning = "${var.http_404_errors_count_rate_threshold_warning}" critical = "${var.http_404_errors_count_rate_threshold_critical}" } - notify_no_data = false # Will NOT notify when no data is received - renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}" + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = 0 require_full_window = true timeout_h = 
"${var.http_404_errors_count_rate_timeout_h}" - include_tags = "${var.http_404_errors_count_rate_include_tags}" + include_tags = true tags = "${var.http_404_errors_count_rate_tags}" } # Monitoring App Services HTTP 2xx status pages rate resource "datadog_monitor" "appservices_http_2xx_status_rate" { - name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.http_2xx_status_rate_escalation_message}" + name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" + type = "query alert" + message = "${var.message}" - query = < Date: Thu, 23 Nov 2017 17:52:01 +0100 Subject: [PATCH 10/10] MON-74 Normalize monitors --- cloud/azure/app-services/README.md | 54 +-------- cloud/azure/app-services/inputs.tf | 113 +----------------- .../app-services/monitors-app_services.tf | 63 +++++----- 3 files changed, 40 insertions(+), 190 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index 90f5882..e56fac2 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -8,10 +8,8 @@ How to use this module module "datadog-monitors-azure-app-services" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/app-services?ref={revision}" - message = "${module.datadog-message-alerting.alerting-message}" - + message = "${module.datadog-message-alerting.alerting-message}" environment = "${var.environment}" - client_name = "${var.client_name}" } ``` @@ -28,65 +26,23 @@ Creates a DataDog monitors with the following checks : Inputs ------ -| Name | Description | Type | Default | Required | DESKTOP-0PBDRFR: ~ -|------|-------------|:----:|:-----:|:-----:| → -| client_name | Client Name | string | - | yes | +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| | 
delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_# -m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | http_2xx_status_rate_limit | | string | `30` | no | -| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data bef -ore it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be s -kipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` -| no | -| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter m -onitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying - via the API | string | `` | no | | http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | | http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically -resolve from a triggered state. Defaults to false. | string | `false` | no | -| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... 
to write -last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | | http_404_errors_count_rate_limit | | string | `30` | no | -| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of da -ta before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations wil -l be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | ` -true` | no | -| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and fi -lter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when qu -erying via the API | string | `` | no | | http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | | http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | -| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automati -cally resolve from a triggered state. Defaults to false. | string | `false` | no | -| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, - 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's - evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. -Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors -in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the - API | string | `` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | -| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve -from a triggered state. Defaults to false. | string | `false` | no | | message | Message sent when a monitor is triggered | string | - | yes | -| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5 -, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | -| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it' -s evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. - Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors - in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via th -e API | string | `` | no | | response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | | response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | -| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve - from a triggered state. Defaults to false. 
| string | `false` | no | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation --------------------- diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 5f0f2b0..c4bc451 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -3,14 +3,14 @@ variable "environment" { type = "string" } -variable "client_name" { - description = "Client Name" - type = "string" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" } -variable "use_filter_tags" { - description = "Filter the data with service tags if true" - default = "true" +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" } variable "message" { @@ -36,31 +36,6 @@ variable "response_time_threshold_warning" { description = "Warning threshold in seconds" } -variable "response_time_last_time_window_code" { - default = "1h" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "response_time_require_full_window" { - default = false - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "response_time_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "response_time_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "response_time_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -# } - ################################### ### MEMORY USAGE VARIABLES ### ################################### @@ -75,31 +50,6 @@ variable "memory_usage_threshold_warning" { description = "Warning threshold in MiB" } -variable "memory_usage_last_time_window_code" { - default = "5m" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "memory_usage_require_full_window" { - default = false - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "memory_usage_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "memory_usage_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "memory_usage_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." 
-# } - ################################# ### HTTP 404 status pages ### ################################# @@ -118,31 +68,6 @@ variable "http_404_errors_count_rate_threshold_warning" { description = "Warning threshold (number of requests)" } -variable "http_404_errors_count_rate_last_time_window_code" { - default = "5m" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "http_404_errors_count_rate_require_full_window" { - default = true - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "http_404_errors_count_rate_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "http_404_errors_count_rate_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "http_404_errors_count_rate_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -# } - ################################# ### HTTP 202 status pages ### ################################# @@ -160,29 +85,3 @@ variable "http_2xx_status_rate_threshold_warning" { default = 0.95 description = "Warning threshold (percentage)" } - -variable "http_2xx_status_rate_last_time_window_code" { - default = "5m" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... 
to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "http_2xx_status_rate_require_full_window" { - default = true - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "http_2xx_status_rate_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "http_2xx_status_rate_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "http_2xx_status_rate_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -# } - diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 437b7fb..1cff1af 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -2,18 +2,18 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_appservices:enabled,env:%s", var.environment) : "*"}" + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_appservices:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { - name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" - type = "query alert" + name = "[${var.environment}] App Services response time > ${var.response_time_threshold_critical}s on {{name}}" + type = "metric alert" message = "${var.message}" query = <= ${var.response_time_threshold_critical} EOF @@ -26,24 +26,23 @@ resource "datadog_monitor" "appservices_response_time" { critical = "${var.response_time_threshold_critical}" } - notify_no_data = true # Will notify when no data is received + notify_no_data = true # Will notify when no data is received renotify_interval = 0 - require_full_window = "${var.response_time_require_full_window}" + require_full_window = true + timeout_h = 0 + include_tags = true - timeout_h = "${var.response_time_timeout_h}" - include_tags = true - - tags = "${var.response_time_tags}" + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } # Monitoring App Services memory usage resource "datadog_monitor" "appservices_memory_usage_count" { - name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" - type = "query alert" + name = "[${var.environment}] App Services memory usage > ${ceil(var.memory_usage_threshold_critical/1000000)}MiB on {{name}}" + type = "metric alert" message = "${var.message}" query = <= ${var.memory_usage_threshold_critical} EOF @@ -58,22 +57,21 @@ resource "datadog_monitor" "appservices_memory_usage_count" { notify_no_data = true # Will notify when no data is received renotify_interval = 0 - require_full_window = "${var.memory_usage_require_full_window}" + require_full_window = true + timeout_h = 0 + include_tags = true - timeout_h
= "${var.memory_usage_timeout_h}" - include_tags = true - - tags = "${var.memory_usage_tags}" + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } # Monitoring App Services 404 errors rate resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" - type = "query alert" + name = "[${var.environment}] App Services HTTP errors > ${var.http_404_errors_count_rate_limit} limit on {{name}}" + type = "metric alert" message = "${var.message}" query = < ${var.http_404_errors_count_rate_threshold_critical} EOF @@ -89,21 +87,20 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { notify_no_data = false # Will NOT notify when no data is received renotify_interval = 0 require_full_window = true + timeout_h = 0 + include_tags = true - timeout_h = "${var.http_404_errors_count_rate_timeout_h}" - include_tags = true - - tags = "${var.http_404_errors_count_rate_tags}" + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } # Monitoring App Services HTTP 2xx status pages rate resource "datadog_monitor" "appservices_http_2xx_status_rate" { - name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" - type = "query alert" + name = "[${var.environment}] App Services Too much non 2xx HTTP status in response to the requests on {{name}}" + type = "metric alert" message = "${var.message}" query = <