diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index d8a02c7..90f5882 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -28,52 +28,64 @@ Creates a DataDog monitors with the following checks : Inputs ------ -| Name | Description | Type | Default | Required | -|------|-------------|:----:|:-----:|:-----:| +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| client_name | Client Name | string | - | yes | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| http_2xx_status_rate_appserv_eval_delay | | string | `600` | no | -| http_2xx_status_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| http_2xx_status_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_# +m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | | http_2xx_status_rate_limit | | string | `30` | no | -| http_2xx_status_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. 
We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | -| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data bef +ore it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be s +kipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` +| no | +| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter m +onitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying + via the API | string | `` | no | | http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | | http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| http_404_errors_count_rate_appserv_eval_delay | | string | `600` | no | -| http_404_errors_count_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| http_404_errors_count_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. 
| string | `false` | no | -| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically +resolve from a triggered state. Defaults to false. | string | `false` | no | +| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write +last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | | http_404_errors_count_rate_limit | | string | `30` | no | -| http_404_errors_count_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | -| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of da +ta before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations wil +l be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. 
| string | ` +true` | no | +| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and fi +lter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when qu +erying via the API | string | `` | no | | http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | | http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | -| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| memory_usage_appserv_eval_delay | | string | `600` | no | -| memory_usage_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| memory_usage_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| memory_usage_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. 
| string | `false` | no | -| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automati +cally resolve from a triggered state. Defaults to false. | string | `false` | no | +| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, + 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's + evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. +Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors +in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the + API | string | `` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | -| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve +from a triggered state. Defaults to false. 
| string | `false` | no | | message | Message sent when a monitor is triggered | string | - | yes | -| response_time_appserv_eval_delay | Delay in seconds for the metric evaluation | string | `600` | no | -| response_time_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| response_time_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | -| response_time_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5 +, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | +| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it' +s evaluated. 
We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. + Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors + in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via th +e API | string | `` | no | | response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | | response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | -| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve + from a triggered state. Defaults to false. 
| string | `false` | no | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 4f2a693..5f0f2b0 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -3,6 +3,11 @@ variable "environment" { type = "string" } +variable "client_name" { + description = "Client Name" + type = "string" +} + variable "use_filter_tags" { description = "Filter the data with service tags if true" default = "true" @@ -12,15 +17,15 @@ variable "message" { description = "Message sent when a monitor is triggered" } -################################### -### RESPONSE TIME VARIABLES ### -################################### - -variable "response_time_appserv_eval_delay" { +variable "delay" { description = "Delay in seconds for the metric evaluation" default = 600 } +################################### +### RESPONSE TIME VARIABLES ### +################################### + variable "response_time_threshold_critical" { default = 0.8 description = "Alerting threshold in seconds" @@ -51,34 +56,15 @@ variable "response_time_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "response_time_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "response_time_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "response_time_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." 
-} - -variable "response_time_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." -} - ################################### ### MEMORY USAGE VARIABLES ### ################################### -variable "memory_usage_appserv_eval_delay" { - default = 600 -} - variable "memory_usage_threshold_critical" { default = 52430000 description = "Alerting threshold in Mib" @@ -109,26 +95,11 @@ variable "memory_usage_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "memory_usage_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "memory_usage_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "memory_usage_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." -} - -variable "memory_usage_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
-} - ################################# ### HTTP 404 status pages ### ################################# @@ -137,10 +108,6 @@ variable "http_404_errors_count_rate_limit" { default = 30 } -variable "http_404_errors_count_rate_appserv_eval_delay" { - default = 600 -} - variable "http_404_errors_count_rate_threshold_critical" { default = 30 description = "Alerting threshold (number of requests)" @@ -171,26 +138,11 @@ variable "http_404_errors_count_rate_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "http_404_errors_count_rate_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "http_404_errors_count_rate_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "http_404_errors_count_rate_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." -} - -variable "http_404_errors_count_rate_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
-} - ################################# ### HTTP 202 status pages ### ################################# @@ -199,10 +151,6 @@ variable "http_2xx_status_rate_limit" { default = 30 } -variable "http_2xx_status_rate_appserv_eval_delay" { - default = 600 -} - variable "http_2xx_status_rate_threshold_critical" { default = 0.9 description = "Alerting threshold (percentage)" @@ -233,22 +181,8 @@ variable "http_2xx_status_rate_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "http_2xx_status_rate_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "http_2xx_status_rate_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "http_2xx_status_rate_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." -} - -variable "http_2xx_status_rate_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
-} diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index c42ad6c..437b7fb 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -8,10 +8,9 @@ data "template_file" "filter" { # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { - name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.response_time_escalation_message}" + name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" + type = "query alert" + message = "${var.message}" query = <= ${var.response_time_threshold_critical} EOF - evaluation_delay = "${var.response_time_appserv_eval_delay}" + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" thresholds { warning = "${var.response_time_threshold_warning}" @@ -27,21 +27,20 @@ resource "datadog_monitor" "appservices_response_time" { } notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.response_time_renotify_interval}" + renotify_interval = 0 require_full_window = "${var.response_time_require_full_window}" timeout_h = "${var.response_time_timeout_h}" - include_tags = "${var.response_time_include_tags}" + include_tags = true tags = "${var.response_time_tags}" } # Monitoring App Services memory usage resource "datadog_monitor" "appservices_memory_usage_count" { - name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.memory_usage_escalation_message}" + name = "[${var.environment}] App Services memory usage {{value}} bytes is above 
${ceil(var.memory_usage_threshold_critical/1000000)}MiB" + type = "query alert" + message = "${var.message}" query = <= ${var.memory_usage_threshold_critical} EOF - evaluation_delay = "${var.memory_usage_appserv_eval_delay}" + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" thresholds { warning = "${var.memory_usage_threshold_warning}" @@ -57,21 +57,20 @@ resource "datadog_monitor" "appservices_memory_usage_count" { } notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.memory_usage_renotify_interval}" + renotify_interval = 0 require_full_window = "${var.memory_usage_require_full_window}" timeout_h = "${var.memory_usage_timeout_h}" - include_tags = "${var.memory_usage_include_tags}" + include_tags = true tags = "${var.memory_usage_tags}" } # Monitoring App Services 404 errors rate resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.http_404_errors_count_rate_escalation_message}" + name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" + type = "query alert" + message = "${var.message}" query = < ${var.http_404_errors_count_rate_threshold_critical} EOF - evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}" + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" thresholds { warning = "${var.http_404_errors_count_rate_threshold_warning}" critical = "${var.http_404_errors_count_rate_threshold_critical}" } - notify_no_data = false # Will NOT notify when no data is received - renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}" + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = 0 require_full_window = true timeout_h = 
"${var.http_404_errors_count_rate_timeout_h}" - include_tags = "${var.http_404_errors_count_rate_include_tags}" + include_tags = true tags = "${var.http_404_errors_count_rate_tags}" } # Monitoring App Services HTTP 2xx status pages rate resource "datadog_monitor" "appservices_http_2xx_status_rate" { - name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.http_2xx_status_rate_escalation_message}" + name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" + type = "query alert" + message = "${var.message}" - query = <