From 012d16b77a7dddef1b66176295fe075ef516c01d Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Thu, 23 Nov 2017 17:52:01 +0100 Subject: [PATCH] MON-74 Normalize monitors --- cloud/azure/app-services/README.md | 54 +-------- cloud/azure/app-services/inputs.tf | 113 +----------------- .../app-services/monitors-app_services.tf | 63 +++++----- 3 files changed, 40 insertions(+), 190 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index 90f5882..e56fac2 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -8,10 +8,8 @@ How to use this module module "datadog-monitors-azure-app-services" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/app-services?ref={revision}" - message = "${module.datadog-message-alerting.alerting-message}" - + message = "${module.datadog-message-alerting.alerting-message}" environment = "${var.environment}" - client_name = "${var.client_name}" } ``` @@ -28,65 +26,23 @@ Creates a DataDog monitors with the following checks : Inputs ------ -| Name | Description | Type | Default | Required | DESKTOP-0PBDRFR: ~ -|------|-------------|:----:|:-----:|:-----:| → -| client_name | Client Name | string | - | yes | +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_# -m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | http_2xx_status_rate_limit | | string | `30` | no | -| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data bef -ore it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be s -kipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` -| no | -| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter m -onitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying - via the API | string | `` | no | | http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | | http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically -resolve from a triggered state. Defaults to false. | string | `false` | no | -| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write -last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | | http_404_errors_count_rate_limit | | string | `30` | no | -| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of da -ta before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations wil -l be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | ` -true` | no | -| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and fi -lter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when qu -erying via the API | string | `` | no | | http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | | http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | -| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automati -cally resolve from a triggered state. Defaults to false. | string | `false` | no | -| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, - 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's - evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. -Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors -in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the - API | string | `` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | -| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve -from a triggered state. Defaults to false. | string | `false` | no | | message | Message sent when a monitor is triggered | string | - | yes | -| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5 -, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | -| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it' -s evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. - Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors - in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via th -e API | string | `` | no | | response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | | response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | -| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve - from a triggered state. Defaults to false. | string | `false` | no | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation --------------------- diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 5f0f2b0..c4bc451 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -3,14 +3,14 @@ variable "environment" { type = "string" } -variable "client_name" { - description = "Client Name" - type = "string" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" } -variable "use_filter_tags" { - description = "Filter the data with service tags if true" - default = "true" +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" } variable "message" { @@ -36,31 +36,6 @@ variable "response_time_threshold_warning" { description = "Warning threshold in seconds" } -variable "response_time_last_time_window_code" { - default = "1h" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "response_time_require_full_window" { - default = false - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "response_time_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "response_time_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "response_time_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -# } - ################################### ### MEMORY USAGE VARIABLES ### ################################### @@ -75,31 +50,6 @@ variable "memory_usage_threshold_warning" { description = "Warning threshold in MiB" } -variable "memory_usage_last_time_window_code" { - default = "5m" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "memory_usage_require_full_window" { - default = false - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "memory_usage_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "memory_usage_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "memory_usage_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -# } - ################################# ### HTTP 404 status pages ### ################################# @@ -118,31 +68,6 @@ variable "http_404_errors_count_rate_threshold_warning" { description = "Warning threshold (number of requests)" } -variable "http_404_errors_count_rate_last_time_window_code" { - default = "5m" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "http_404_errors_count_rate_require_full_window" { - default = true - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "http_404_errors_count_rate_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "http_404_errors_count_rate_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "http_404_errors_count_rate_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -# } - ################################# ### HTTP 202 status pages ### ################################# @@ -160,29 +85,3 @@ variable "http_2xx_status_rate_threshold_warning" { default = 0.95 description = "Warning threshold (percentage)" } - -variable "http_2xx_status_rate_last_time_window_code" { - default = "5m" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "http_2xx_status_rate_require_full_window" { - default = true - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "http_2xx_status_rate_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "http_2xx_status_rate_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "http_2xx_status_rate_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -# } - diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 437b7fb..1cff1af 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -2,18 +2,18 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_appservices:enabled,env:%s", var.environment) : "*"}" + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { - name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" - type = "query alert" + name = "[${var.environment}] App Services response time > ${var.response_time_threshold_critical}s on {{name}}" + type = "metric alert" message = "${var.message}" query = <= ${var.response_time_threshold_critical} EOF @@ -26,24 +26,23 @@ resource "datadog_monitor" "appservices_response_time" { critical = "${var.response_time_threshold_critical}" } - notify_no_data = true # Will notify when no data is received + notify_no_data = true # Will notify when no data is received renotify_interval = 0 - require_full_window = "${var.response_time_require_full_window}" + require_full_window = true + timeout_h = 0 + include_tags = true - timeout_h = "${var.response_time_timeout_h}" - include_tags = true - - tags = "${var.response_time_tags}" + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } # Monitoring App Services memory usage resource "datadog_monitor" "appservices_memory_usage_count" { - name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" - type = "query alert" + name = "[${var.environment}] App Services memory usage > ${ceil(var.memory_usage_threshold_critical/1000000)}MiB on {{name}}" + type = "metric alert" message = "${var.message}" query = <= ${var.memory_usage_threshold_critical} EOF @@ -58,22 +57,21 @@ resource "datadog_monitor" "appservices_memory_usage_count" { notify_no_data = true # Will notify when no data is received renotify_interval = 0 - require_full_window = "${var.memory_usage_require_full_window}" + require_full_window = true + timeout_h = 0 + include_tags = true - timeout_h = "${var.memory_usage_timeout_h}" - include_tags = true - - tags = "${var.memory_usage_tags}" + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } # Monitoring App Services 404 errors rate resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" - type = "query alert" + name = "[${var.environment}] App Services HTTP errors > ${var.http_404_errors_count_rate_limit} limit on {{name}}" + type = "metric alert" message = "${var.message}" query = < ${var.http_404_errors_count_rate_threshold_critical} EOF @@ -89,21 +87,20 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { notify_no_data = false # Will NOT notify when no data is received renotify_interval = 0 require_full_window = true + timeout_h = 0 + include_tags = true - timeout_h = "${var.http_404_errors_count_rate_timeout_h}" - include_tags = true - - tags = "${var.http_404_errors_count_rate_tags}" + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } # Monitoring App Services HTTP 2xx status pages rate resource "datadog_monitor" "appservices_http_2xx_status_rate" { - name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" - type = "query alert" + name = "[${var.environment}] App Services Too much non 2xx HTTP status in response to the requests on {{name}}" + type = "metric alert" message = "${var.message}" query = <