From 8f62260c1bb4094bfb34a2fdc1dbda1e8e7cfb4d Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 3 Jul 2018 16:15:40 +0200 Subject: [PATCH 01/18] MON-237 - First cosmosdb monitors added --- cloud/azure/README.md | 8 ++ cloud/azure/cosmosdb/README.md | 49 +++++++++ cloud/azure/cosmosdb/inputs.tf | 81 +++++++++++++++ cloud/azure/cosmosdb/monitors-cosmosdb.tf | 115 ++++++++++++++++++++++ cloud/azure/inputs.tf | 41 ++++++++ cloud/azure/monitors.tf | 22 +++++ 6 files changed, 316 insertions(+) create mode 100644 cloud/azure/cosmosdb/README.md create mode 100644 cloud/azure/cosmosdb/inputs.tf create mode 100644 cloud/azure/cosmosdb/monitors-cosmosdb.tf diff --git a/cloud/azure/README.md b/cloud/azure/README.md index 52c33aa..5e67d56 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -83,6 +83,14 @@ Inputs | appservices_response_time_threshold_warning | Warning threshold for response time in seconds | string | `5` | no | | appservices_response_time_time_aggregator | Monitor aggregator for App Services response time [available values: min, max or avg] | string | `min` | no | | appservices_response_time_timeframe | Monitor timeframe for App Services response time [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | +| cosmos_db_4xx_request_rate_threshold_critical | | string | `80` | no | +| cosmos_db_4xx_request_rate_threshold_warning | | string | `50` | no | +| cosmos_db_4xx_requests_message | Custom message for Cosmos DB 4xx requests monitor | string | `` | no | +| cosmos_db_4xx_requests_silenced | Groups to mute for Cosmos DB 4xx requests monitor | map | `` | no | +| cosmos_db_5xx_request_rate_threshold_critical | | string | `80` | no | +| cosmos_db_5xx_request_rate_threshold_warning | | string | `50` | no | +| cosmos_db_5xx_requests_message | Custom message for Cosmos DB 5xx requests monitor | string | `` | no | +| cosmos_db_5xx_requests_silenced | Groups to mute for Cosmos DB 5xx 
requests monitor | map | `` | no | | delay | Delay in seconds for the metric evaluation | string | `900` | no | | environment | Architecture environment | string | - | yes | | eventhub_errors_rate_message | Custom message for Event Hub errors monitor | string | `` | no | diff --git a/cloud/azure/cosmosdb/README.md b/cloud/azure/cosmosdb/README.md new file mode 100644 index 0000000..1df255e --- /dev/null +++ b/cloud/azure/cosmosdb/README.md @@ -0,0 +1,49 @@ +Azure Cosmos DB DataDog monitors +================================ + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-cosmosdb" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/cosmosdb?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* No request +* Too many 4xx requests +* Too many 5xx requests + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| cosmos_db_4xx_request_rate_threshold_critical | | string | `80` | no | +| cosmos_db_4xx_request_rate_threshold_warning | | string | `50` | no | +| cosmos_db_4xx_requests_message | Custom message for Cosmos DB 4xx requests monitor | string | `` | no | +| cosmos_db_4xx_requests_silenced | Groups to mute for Cosmos DB 4xx requests monitor | map | `` | no | +| cosmos_db_5xx_request_rate_threshold_critical | | string | `80` | no | +| cosmos_db_5xx_request_rate_threshold_warning | | string | `50` | no | +| cosmos_db_5xx_requests_message | Custom message for Cosmos DB 5xx requests monitor | string | `` | no | +| cosmos_db_5xx_requests_silenced | Groups to mute for Cosmos DB 5xx requests monitor | map | `` | no | +| cosmos_db_no_request_message | Custom message for Cosmos DB no request monitor | string | `` | no | +| cosmos_db_no_request_silenced | Groups to mute for 
Cosmos DB no request monitor | map | `` | no | +| delay | Delay in seconds for the metric evaluation | string | `900` | no | +| environment | Architecture environment | string | - | yes | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| subscription_id | ID of the subscription | string | - | yes | + +Related documentation +--------------------- + +To be defined diff --git a/cloud/azure/cosmosdb/inputs.tf b/cloud/azure/cosmosdb/inputs.tf new file mode 100644 index 0000000..9232eed --- /dev/null +++ b/cloud/azure/cosmosdb/inputs.tf @@ -0,0 +1,81 @@ +variable "environment" { + description = "Architecture environment" + type = "string" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "message" { + description = "Message sent when a monitor is triggered" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 900 +} + +variable "subscription_id" { + description = "ID of the subscription" + type = "string" +} + +# Azure CosmosDB specific variables +variable "cosmos_db_4xx_requests_message" { + description = "Custom message for Cosmos DB 4xx requests monitor" + type = "string" + default = "" +} + +variable "cosmos_db_4xx_requests_silenced" { + description = "Groups to mute for Cosmos DB 4xx requests monitor" + type = "map" + default = {} +} + +variable "cosmos_db_4xx_request_rate_threshold_critical" { + default = 80 +} + +variable "cosmos_db_4xx_request_rate_threshold_warning" { + default = 50 +} + +variable "cosmos_db_5xx_requests_message" { + description = "Custom message for 
Cosmos DB 5xx requests monitor" + type = "string" + default = "" +} + +variable "cosmos_db_5xx_requests_silenced" { + description = "Groups to mute for Cosmos DB 5xx requests monitor" + type = "map" + default = {} +} + +variable "cosmos_db_5xx_request_rate_threshold_critical" { + default = 80 +} + +variable "cosmos_db_5xx_request_rate_threshold_warning" { + default = 50 +} + +variable "cosmos_db_no_request_message" { + description = "Custom message for Cosmos DB no request monitor" + type = "string" + default = "" +} + +variable "cosmos_db_no_request_silenced" { + description = "Groups to mute for Cosmos DB no request monitor" + type = "map" + default = {} +} diff --git a/cloud/azure/cosmosdb/monitors-cosmosdb.tf b/cloud/azure/cosmosdb/monitors-cosmosdb.tf new file mode 100644 index 0000000..95fa7bb --- /dev/null +++ b/cloud/azure/cosmosdb/monitors-cosmosdb.tf @@ -0,0 +1,115 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
format("subscription_id:%s", var.subscription_id) : "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "cosmos_db_4xx_requests" { + name = "[${var.environment}] Cosmos DB 4xx requests rate is too low {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.cosmos_db_4xx_requests_message, var.message)}" + + query = < ${var.cosmos_db_4xx_request_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + critical = "${var.cosmos_db_4xx_request_rate_threshold_critical}" + warning = "${var.cosmos_db_4xx_request_rate_threshold_warning}" + } + + silenced = "${var.cosmos_db_4xx_requests_silenced}" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:cosmos_db", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "cosmos_db_5xx_requests" { + name = "[${var.environment}] Cosmos DB 5xx requests rate is too low {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.cosmos_db_5xx_requests_message, var.message)}" + + query = < ${var.cosmos_db_5xx_request_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + critical = "${var.cosmos_db_5xx_request_rate_threshold_critical}" + warning = "${var.cosmos_db_5xx_request_rate_threshold_warning}" + } + + silenced = "${var.cosmos_db_5xx_requests_silenced}" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + 
no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:cosmos_db", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "cosmos_db_success_no_data" { + name = "[${var.environment}] Cosmos DB has no request" + message = "${coalesce(var.cosmos_db_no_request_message, var.message)}" + + query = < Date: Wed, 4 Jul 2018 17:42:52 +0200 Subject: [PATCH 02/18] MON-237 - Cosmos RU utilization monitor added --- cloud/azure/README.md | 15 ++- cloud/azure/cosmosdb/README.md | 120 +++++++++++++--------- cloud/azure/cosmosdb/inputs.tf | 39 ++++++- cloud/azure/cosmosdb/monitors-cosmosdb.tf | 42 +++++++- cloud/azure/inputs.tf | 51 ++++++++- cloud/azure/monitors.tf | 6 ++ 6 files changed, 207 insertions(+), 66 deletions(-) diff --git a/cloud/azure/README.md b/cloud/azure/README.md index 5e67d56..e6785e7 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -83,14 +83,21 @@ Inputs | appservices_response_time_threshold_warning | Warning threshold for response time in seconds | string | `5` | no | | appservices_response_time_time_aggregator | Monitor aggregator for App Services response time [available values: min, max or avg] | string | `min` | no | | appservices_response_time_timeframe | Monitor timeframe for App Services response time [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | -| cosmos_db_4xx_request_rate_threshold_critical | | string | `80` | no | -| cosmos_db_4xx_request_rate_threshold_warning | | string | `50` | no | +| cosmos_db_4xx_request_rate_threshold_critical | Critical threshold for Cosmos DB 4xx requests monitor | string | `80` | no | +| cosmos_db_4xx_request_rate_threshold_warning | Warning threshold for Cosmos DB 4xx requests monitor | string | `50` | no | | cosmos_db_4xx_requests_message | Custom message for Cosmos DB 4xx requests monitor | string | `` | no | | cosmos_db_4xx_requests_silenced | Groups to mute for Cosmos DB 4xx requests monitor | map | `` 
| no | -| cosmos_db_5xx_request_rate_threshold_critical | | string | `80` | no | -| cosmos_db_5xx_request_rate_threshold_warning | | string | `50` | no | +| cosmos_db_5xx_request_rate_threshold_critical | Critical threshold for Cosmos DB 5xx requests monitor | string | `80` | no | +| cosmos_db_5xx_request_rate_threshold_warning | Warning threshold for Cosmos DB 5xx requests monitor | string | `50` | no | | cosmos_db_5xx_requests_message | Custom message for Cosmos DB 5xx requests monitor | string | `` | no | | cosmos_db_5xx_requests_silenced | Groups to mute for Cosmos DB 5xx requests monitor | map | `` | no | +| cosmos_db_no_request_message | Custom message for Cosmos DB no request monitor | string | `` | no | +| cosmos_db_no_request_silenced | Groups to mute for Cosmos DB no request monitor | map | `` | no | +| cosmos_db_ru_utilization_collection | Group to associate Cosmos DB collection to RU max | map | - | yes | +| cosmos_db_ru_utilization_message | Custom message for Cosmos DB collection RU utilization monitor | string | `` | no | +| cosmos_db_ru_utilization_rate_threshold_critical | Critical threshold for Cosmos DB collection RU utilization monitor | string | `90` | no | +| cosmos_db_ru_utilization_rate_threshold_warning | Warning threshold for Cosmos DB collection RU utilization monitor | string | `80` | no | +| cosmos_db_ru_utilization_silenced | Groups to mute for Cosmos DB collection RU utilization monitor | map | `` | no | | delay | Delay in seconds for the metric evaluation | string | `900` | no | | environment | Architecture environment | string | - | yes | | eventhub_errors_rate_message | Custom message for Event Hub errors monitor | string | `` | no | diff --git a/cloud/azure/cosmosdb/README.md b/cloud/azure/cosmosdb/README.md index 1df255e..28a8e26 100644 --- a/cloud/azure/cosmosdb/README.md +++ b/cloud/azure/cosmosdb/README.md @@ -1,49 +1,71 @@ -Azure Cosmos DB DataDog monitors -================================ - -How to use this module 
----------------------- - -``` -module "datadog-monitors-azure-cosmosdb" { - source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/cosmosdb?ref={revision}" - - message = "${module.datadog-message-alerting.alerting-message}" - environment = "${var.environment}" -} -``` - -Purpose -------- -Creates a DataDog monitors with the following checks : - -* No request -* Too many 4xx requests -* Too many 5xx requests - -Inputs ------- - -| Name | Description | Type | Default | Required | -|------|-------------|:----:|:-----:|:-----:| -| cosmos_db_4xx_request_rate_threshold_critical | | string | `80` | no | -| cosmos_db_4xx_request_rate_threshold_warning | | string | `50` | no | -| cosmos_db_4xx_requests_message | Custom message for Cosmos DB 4xx requests monitor | string | `` | no | -| cosmos_db_4xx_requests_silenced | Groups to mute for Cosmos DB 4xx requests monitor | map | `` | no | -| cosmos_db_5xx_request_rate_threshold_critical | | string | `80` | no | -| cosmos_db_5xx_request_rate_threshold_warning | | string | `50` | no | -| cosmos_db_5xx_requests_message | Custom message for Cosmos DB 5xx requests monitor | string | `` | no | -| cosmos_db_5xx_requests_silenced | Groups to mute for Cosmos DB 5xx requests monitor | map | `` | no | -| cosmos_db_no_request_message | Custom message for Cosmos DB no request monitor | string | `` | no | -| cosmos_db_no_request_silenced | Groups to mute for Cosmos DB no request monitor | map | `` | no | -| delay | Delay in seconds for the metric evaluation | string | `900` | no | -| environment | Architecture environment | string | - | yes | -| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | -| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | -| message | Message sent when a monitor is triggered | string | - | yes | -| subscription_id | ID of the subscription | string | - | yes | - -Related documentation 
---------------------- - -To be defined +Azure Cosmos DB DataDog monitors +================================ + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-cosmosdb" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/cosmosdb?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +How to define cosmos_db_ru_utilization_collection variable +---------------------------------------------------------- + +At the time this module is defined, we can't define Cosmos DB collection with Terraform, so we have to define a variable making the connection between collections and RU max + +``` +variable cosmos_db_ru_utilization_collection { + type = "map" + default = { + "collection_1" = "ru_max_1" + "collection_2" = "ru_max_2" + ... + } +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* No request +* Too many 4xx requests +* Too many 5xx requests +* Collection RU utilization + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| cosmos_db_4xx_request_rate_threshold_critical | Critical threshold for Cosmos DB 4xx requests monitor | string | `80` | no | +| cosmos_db_4xx_request_rate_threshold_warning | Warning threshold for Cosmos DB 4xx requests monitor | string | `50` | no | +| cosmos_db_4xx_requests_message | Custom message for Cosmos DB 4xx requests monitor | string | `` | no | +| cosmos_db_4xx_requests_silenced | Groups to mute for Cosmos DB 4xx requests monitor | map | `` | no | +| cosmos_db_5xx_request_rate_threshold_critical | Critical threshold for Cosmos DB 5xx requests monitor | string | `80` | no | +| cosmos_db_5xx_request_rate_threshold_warning | Warning threshold for Cosmos DB 5xx requests monitor | string | `50` | no | +| cosmos_db_5xx_requests_message | Custom message for Cosmos DB 5xx requests monitor | 
string | `` | no | +| cosmos_db_5xx_requests_silenced | Groups to mute for Cosmos DB 5xx requests monitor | map | `` | no | +| cosmos_db_no_request_message | Custom message for Cosmos DB no request monitor | string | `` | no | +| cosmos_db_no_request_silenced | Groups to mute for Cosmos DB no request monitor | map | `` | no | +| cosmos_db_ru_utilization_collection | Group to associate Cosmos DB collection to RU max | map | - | yes | +| cosmos_db_ru_utilization_message | Custom message for Cosmos DB collection RU utilization monitor | string | `` | no | +| cosmos_db_ru_utilization_rate_threshold_critical | Critical threshold for Cosmos DB collection RU utilization monitor | string | `90` | no | +| cosmos_db_ru_utilization_rate_threshold_warning | Warning threshold for Cosmos DB collection RU utilization monitor | string | `80` | no | +| cosmos_db_ru_utilization_silenced | Groups to mute for Cosmos DB collection RU utilization monitor | map | `` | no | +| delay | Delay in seconds for the metric evaluation | string | `900` | no | +| environment | Architecture environment | string | - | yes | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| subscription_id | ID of the subscription | string | - | yes | + +Related documentation +--------------------- + +To be defined diff --git a/cloud/azure/cosmosdb/inputs.tf b/cloud/azure/cosmosdb/inputs.tf index 9232eed..40a94cf 100644 --- a/cloud/azure/cosmosdb/inputs.tf +++ b/cloud/azure/cosmosdb/inputs.tf @@ -41,11 +41,13 @@ variable "cosmos_db_4xx_requests_silenced" { } variable "cosmos_db_4xx_request_rate_threshold_critical" { - default = 80 + description = "Critical threshold for Cosmos DB 4xx requests monitor" + default = 80 } variable "cosmos_db_4xx_request_rate_threshold_warning" { - default = 
50 + description = "Warning threshold for Cosmos DB 4xx requests monitor" + default = 50 } variable "cosmos_db_5xx_requests_message" { @@ -61,11 +63,13 @@ variable "cosmos_db_5xx_requests_silenced" { } variable "cosmos_db_5xx_request_rate_threshold_critical" { - default = 80 + description = "Critical threshold for Cosmos DB 5xx requests monitor" + default = 80 } variable "cosmos_db_5xx_request_rate_threshold_warning" { - default = 50 + description = "Warning threshold for Cosmos DB 5xx requests monitor" + default = 50 } variable "cosmos_db_no_request_message" { @@ -79,3 +83,30 @@ variable "cosmos_db_no_request_silenced" { type = "map" default = {} } + +variable "cosmos_db_ru_utilization_message" { + description = "Custom message for Cosmos DB collection RU utilization monitor" + type = "string" + default = "" +} + +variable "cosmos_db_ru_utilization_silenced" { + description = "Groups to mute for Cosmos DB collection RU utilization monitor" + type = "map" + default = {} +} + +variable "cosmos_db_ru_utilization_rate_threshold_critical" { + description = "Critical threshold for Cosmos DB collection RU utilization monitor" + default = 90 +} + +variable "cosmos_db_ru_utilization_rate_threshold_warning" { + description = "Warning threshold for Cosmos DB collection RU utilization monitor" + default = 80 +} + +variable "cosmos_db_ru_utilization_collection" { + description = "Group to associate Cosmos DB collection to RU max" + type = "map" +} diff --git a/cloud/azure/cosmosdb/monitors-cosmosdb.tf b/cloud/azure/cosmosdb/monitors-cosmosdb.tf index 95fa7bb..19a21ad 100644 --- a/cloud/azure/cosmosdb/monitors-cosmosdb.tf +++ b/cloud/azure/cosmosdb/monitors-cosmosdb.tf @@ -7,7 +7,7 @@ data "template_file" "filter" { } resource "datadog_monitor" "cosmos_db_4xx_requests" { - name = "[${var.environment}] Cosmos DB 4xx requests rate is too low {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% 
({{value}}%){{/is_warning}}" + name = "[${var.environment}] Cosmos DB 4xx requests rate is high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = "${coalesce(var.cosmos_db_4xx_requests_message, var.message)}" query = < ${var.cosmos_db_ru_utilization_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + critical = "${var.cosmos_db_ru_utilization_rate_threshold_critical}" + warning = "${var.cosmos_db_ru_utilization_rate_threshold_warning}" + } + + silenced = "${var.cosmos_db_ru_utilization_silenced}" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + + tags = ["env:${var.environment}", "resource:cosmos_db", "collection:${element(keys(var.cosmos_db_ru_utilization_collection),count.index)}", "team:azure", "provider:azure"] +} diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf index e12a9c9..b477cd1 100644 --- a/cloud/azure/inputs.tf +++ b/cloud/azure/inputs.tf @@ -1532,11 +1532,13 @@ variable "cosmos_db_4xx_requests_silenced" { } variable "cosmos_db_4xx_request_rate_threshold_critical" { - default = 80 + description = "Critical threshold for Cosmos DB 4xx requests monitor" + default = 80 } variable "cosmos_db_4xx_request_rate_threshold_warning" { - default = 50 + description = "Warning threshold for Cosmos DB 4xx requests monitor" + default = 50 } variable "cosmos_db_5xx_requests_message" { @@ -1552,9 +1554,50 @@ variable "cosmos_db_5xx_requests_silenced" { } variable "cosmos_db_5xx_request_rate_threshold_critical" { - default = 80 + description = "Critical threshold for Cosmos DB 5xx requests monitor" + default = 80 } variable "cosmos_db_5xx_request_rate_threshold_warning" { - default = 50 + description = "Warning threshold for Cosmos DB 5xx requests 
monitor" + default = 50 +} + +variable "cosmos_db_no_request_message" { + description = "Custom message for Cosmos DB no request monitor" + type = "string" + default = "" +} + +variable "cosmos_db_no_request_silenced" { + description = "Groups to mute for Cosmos DB no request monitor" + type = "map" + default = {} +} + +variable "cosmos_db_ru_utilization_message" { + description = "Custom message for Cosmos DB collection RU utilization monitor" + type = "string" + default = "" +} + +variable "cosmos_db_ru_utilization_silenced" { + description = "Groups to mute for Cosmos DB collection RU utilization monitor" + type = "map" + default = {} +} + +variable "cosmos_db_ru_utilization_rate_threshold_critical" { + description = "Critical threshold for Cosmos DB collection RU utilization monitor" + default = 90 +} + +variable "cosmos_db_ru_utilization_rate_threshold_warning" { + description = "Warning threshold for Cosmos DB collection RU utilization monitor" + default = 80 +} + +variable "cosmos_db_ru_utilization_collection" { + description = "Group to associate Cosmos DB collection to RU max" + type = "map" } diff --git a/cloud/azure/monitors.tf b/cloud/azure/monitors.tf index 068ccbe..6738aa6 100644 --- a/cloud/azure/monitors.tf +++ b/cloud/azure/monitors.tf @@ -390,4 +390,10 @@ module "cosmosdb" { cosmos_db_5xx_request_rate_threshold_warning = "${var.cosmos_db_5xx_request_rate_threshold_warning}" cosmos_db_5xx_requests_message = "${var.cosmos_db_5xx_requests_message}" cosmos_db_5xx_requests_silenced = "${var.cosmos_db_5xx_requests_silenced}" + + cosmos_db_ru_utilization_rate_threshold_critical = "${var.cosmos_db_ru_utilization_rate_threshold_critical}" + cosmos_db_ru_utilization_rate_threshold_warning = "${var.cosmos_db_ru_utilization_rate_threshold_warning}" + cosmos_db_ru_utilization_message = "${var.cosmos_db_ru_utilization_message}" + cosmos_db_ru_utilization_silenced = "${var.cosmos_db_ru_utilization_silenced}" + cosmos_db_ru_utilization_collection = 
"${var.cosmos_db_ru_utilization_collection}" } From 44b5153d6bc46153889f996243237b9764af3463 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Thu, 5 Jul 2018 10:23:50 +0200 Subject: [PATCH 03/18] MON-237 - Cosmos DB module updated --- cloud/azure/cosmosdb/README.md | 6 +++++- cloud/azure/cosmosdb/monitors-cosmosdb.tf | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cloud/azure/cosmosdb/README.md b/cloud/azure/cosmosdb/README.md index 28a8e26..9c051e9 100644 --- a/cloud/azure/cosmosdb/README.md +++ b/cloud/azure/cosmosdb/README.md @@ -68,4 +68,8 @@ Inputs Related documentation --------------------- -To be defined +DataDog documentation : [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/) +You must search `cosmosdb`, there is no integration for now. + +Azure metrics documentation : [https://docs.microsoft.com/fr-fr/azure/monitoring-and-diagnostics/monitoring-supported-metrics#microsoftdocumentdbdatabaseaccounts](https://docs.microsoft.com/fr-fr/azure/monitoring-and-diagnostics/monitoring-supported-metrics#microsoftdocumentdbdatabaseaccounts) + diff --git a/cloud/azure/cosmosdb/monitors-cosmosdb.tf b/cloud/azure/cosmosdb/monitors-cosmosdb.tf index 19a21ad..5237c0a 100644 --- a/cloud/azure/cosmosdb/monitors-cosmosdb.tf +++ b/cloud/azure/cosmosdb/monitors-cosmosdb.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.filter_tags_use_defaults == "true" ? format("subscription_id:%s", var.subscription_id) : "${var.filter_tags_custom}"}" + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_cosmosdb:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } From 09dd8a82542eb38dbe1b0ccb882f9b426d5b307e Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Thu, 5 Jul 2018 10:24:16 +0200 Subject: [PATCH 04/18] MON-237 - Service Bus monitors updated --- cloud/azure/README.md | 14 +++ cloud/azure/cosmosdb/inputs.tf | 5 - cloud/azure/cosmosdb/monitors-cosmosdb.tf | 32 +++---- cloud/azure/inputs.tf | 80 ++++++++++++++++ cloud/azure/monitors.tf | 17 ++++ cloud/azure/servicebus/README.md | 19 +++- cloud/azure/servicebus/inputs.tf | 80 ++++++++++++++++ .../azure/servicebus/monitors-service-bus.tf | 95 +++++++++++++++++++ 8 files changed, 320 insertions(+), 22 deletions(-) diff --git a/cloud/azure/README.md b/cloud/azure/README.md index e6785e7..fd33024 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -206,10 +206,24 @@ Inputs | redis_status_silenced | Groups to mute for Redis status monitor | map | `` | no | | redis_status_time_aggregator | Monitor aggregator for Redis status [available values: min, max or avg] | string | `max` | no | | redis_status_timeframe | Monitor timeframe for Redis status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | +| servicebus_no_active_connections_message | Custom message for Service Bus status monitor | string | `` | no | +| servicebus_no_active_connections_silenced | Groups to mute for Service Bus status monitor | map | `` | no | +| servicebus_no_active_connections_time_aggregator | Monitor aggregator for Service Bus status [available values: min, max or avg] | string | `max` | no | +| servicebus_no_active_connections_timeframe | Monitor timeframe for Service Bus status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| servicebus_server_errors_message | Custom message for Service Bus server errors monitor | string | `` | 
no | +| servicebus_server_errors_silenced | Groups to mute for Service Bus server errors monitor | map | `` | no | +| servicebus_server_errors_threshold_critical | Critical threshold for Service Bus server errors monitor | string | `90` | no | +| servicebus_server_errors_threshold_warning | Warning threshold for Service Bus server errors monitor | string | `50` | no | +| servicebus_server_errors_timeframe | Monitor timeframe for Service Bus server errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | servicebus_status_message | Custom message for Service Bus status monitor | string | `` | no | | servicebus_status_silenced | Groups to mute for Service Bus status monitor | map | `` | no | | servicebus_status_time_aggregator | Monitor aggregator for Service Bus status [available values: min, max or avg] | string | `max` | no | | servicebus_status_timeframe | Monitor timeframe for Service Bus status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| servicebus_user_errors_message | Custom message for Service Bus user errors monitor | string | `` | no | +| servicebus_user_errors_silenced | Groups to mute for Service Bus user errors monitor | map | `` | no | +| servicebus_user_errors_threshold_critical | Critical threshold for Service Bus user errors monitor | string | `90` | no | +| servicebus_user_errors_threshold_warning | Warning threshold for Service Bus user errors monitor | string | `50` | no | +| servicebus_user_errors_timeframe | Monitor timeframe for Service Bus user errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | sqldatabase_cpu_message | Custom message for SQL CPU monitor | string | `` | no | | sqldatabase_cpu_silenced | Groups to mute for SQL CPU monitor | map | `` | no | | sqldatabase_cpu_threshold_critical | CPU usage in percent 
(critical threshold) | string | `90` | no | diff --git a/cloud/azure/cosmosdb/inputs.tf b/cloud/azure/cosmosdb/inputs.tf index 40a94cf..e8d04bf 100644 --- a/cloud/azure/cosmosdb/inputs.tf +++ b/cloud/azure/cosmosdb/inputs.tf @@ -22,11 +22,6 @@ variable "delay" { default = 900 } -variable "subscription_id" { - description = "ID of the subscription" - type = "string" -} - # Azure CosmosDB specific variables variable "cosmos_db_4xx_requests_message" { description = "Custom message for Cosmos DB 4xx requests monitor" diff --git a/cloud/azure/cosmosdb/monitors-cosmosdb.tf b/cloud/azure/cosmosdb/monitors-cosmosdb.tf index 5237c0a..fb4b916 100644 --- a/cloud/azure/cosmosdb/monitors-cosmosdb.tf +++ b/cloud/azure/cosmosdb/monitors-cosmosdb.tf @@ -12,17 +12,17 @@ resource "datadog_monitor" "cosmos_db_4xx_requests" { query = < ${var.cosmos_db_4xx_request_rate_threshold_critical} EOF @@ -55,9 +55,9 @@ resource "datadog_monitor" "cosmos_db_5xx_requests" { query = < ${var.cosmos_db_5xx_request_rate_threshold_critical} EOF @@ -90,7 +90,7 @@ resource "datadog_monitor" "cosmos_db_success_no_data" { query = < ${var.cosmos_db_ru_utilization_rate_threshold_critical} EOF diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf index b477cd1..8227fa2 100644 --- a/cloud/azure/inputs.tf +++ b/cloud/azure/inputs.tf @@ -930,6 +930,86 @@ variable "servicebus_status_timeframe" { default = "last_15m" } +variable "servicebus_no_active_connections_silenced" { + description = "Groups to mute for Service Bus status monitor" + type = "map" + default = {} +} + +variable "servicebus_no_active_connections_message" { + description = "Custom message for Service Bus status monitor" + type = "string" + default = "" +} + +variable "servicebus_no_active_connections_time_aggregator" { + description = "Monitor aggregator for Service Bus status [available values: min, max or avg]" + type = "string" + default = "max" +} + +variable "servicebus_no_active_connections_timeframe" { + description = "Monitor 
timeframe for Service Bus status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_15m" +} + +variable "servicebus_server_errors_message" { + description = "Custom message for Service Bus server errors monitor" + type = "string" + default = "" +} + +variable "servicebus_server_errors_silenced" { + description = "Groups to mute for Service Bus server errors monitor" + type = "map" + default = {} +} + +variable "servicebus_server_errors_timeframe" { + description = "Monitor timeframe for Service Bus server errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "servicebus_server_errors_threshold_critical" { + description = "Critical threshold for Service Bus server errors monitor" + default = 90 +} + +variable "servicebus_server_errors_threshold_warning" { + description = "Warning threshold for Service Bus server errors monitor" + default = 50 +} + +variable "servicebus_user_errors_message" { + description = "Custom message for Service Bus user errors monitor" + type = "string" + default = "" +} + +variable "servicebus_user_errors_silenced" { + description = "Groups to mute for Service Bus user errors monitor" + type = "map" + default = {} +} + +variable "servicebus_user_errors_timeframe" { + description = "Monitor timeframe for Service Bus user errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "servicebus_user_errors_threshold_critical" { + description = "Critical threshold for Service Bus user errors monitor" + default = 90 +} + +variable "servicebus_user_errors_threshold_warning" { + description = "Warning threshold for Service Bus user errors monitor" + default = 50 +} + # Azure SQL Database specific variables variable "sqldatabase_cpu_silenced" { description = "Groups to mute for SQL CPU 
monitor" diff --git a/cloud/azure/monitors.tf b/cloud/azure/monitors.tf index 6738aa6..0716157 100644 --- a/cloud/azure/monitors.tf +++ b/cloud/azure/monitors.tf @@ -230,6 +230,23 @@ module "servicebus" { status_message = "${var.servicebus_status_message}" status_timeframe = "${var.servicebus_status_timeframe}" status_time_aggregator = "${var.servicebus_status_time_aggregator}" + + no_active_connections_silenced = "${var.servicebus_no_active_connections_silenced}" + no_active_connections_message = "${var.servicebus_no_active_connections_message}" + no_active_connections_timeframe = "${var.servicebus_no_active_connections_timeframe}" + no_active_connections_time_aggregator = "${var.servicebus_no_active_connections_time_aggregator}" + + server_errors_silenced = "${var.servicebus_server_errors_silenced}" + server_errors_message = "${var.servicebus_server_errors_message}" + server_errors_timeframe = "${var.servicebus_server_errors_timeframe}" + server_errors_threshold_critical = "${var.servicebus_server_errors_threshold_critical}" + server_errors_threshold_warning = "${var.servicebus_server_errors_threshold_warning}" + + user_errors_silenced = "${var.servicebus_user_errors_silenced}" + user_errors_message = "${var.servicebus_user_errors_message}" + user_errors_timeframe = "${var.servicebus_user_errors_timeframe}" + user_errors_threshold_critical = "${var.servicebus_user_errors_threshold_critical}" + user_errors_threshold_warning = "${var.servicebus_user_errors_threshold_warning}" } module "sqldatabase" { diff --git a/cloud/azure/servicebus/README.md b/cloud/azure/servicebus/README.md index 052aab1..8c3fa7e 100644 --- a/cloud/azure/servicebus/README.md +++ b/cloud/azure/servicebus/README.md @@ -16,7 +16,10 @@ module "datadog-monitors-cloud-azure-servicebus" { Creates DataDog monitors with the following checks: -- Service Bus is down +- Service status check +- No active connection +- Server errors rate +- User errors rate ## Inputs @@ -27,12 +30,26 @@ Creates DataDog 
monitors with the following checks: | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when an alert is triggered | string | - | yes | +| no_active_connections_message | Custom message for Service Bus no active connections monitor | string | `` | no | +| no_active_connections_silenced | Groups to mute for Service Bus no active connections monitor | map | `` | no | +| no_active_connections_time_aggregator | Monitor aggregator for Service Bus no active connections [available values: min, max or avg] | string | `max` | no | +| no_active_connections_timeframe | Monitor timeframe for Service Bus no active connections [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| server_errors_message | Custom message for Service Bus server errors monitor | string | `` | no | +| server_errors_silenced | Groups to mute for Service Bus server errors monitor | map | `` | no | +| server_errors_threshold_critical | Critical threshold for Service Bus server errors monitor | string | `90` | no | +| server_errors_threshold_warning | Warning threshold for Service Bus server errors monitor | string | `50` | no | +| server_errors_timeframe | Monitor timeframe for Service Bus server errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | new_host_delay | Delay in seconds before monitor new resource | string | `300` | no | | status_extra_tags | Extra tags for Service Bus status monitor | list | `` | no | | status_message | Custom message for Service Bus status monitor | string | `` | no | | status_silenced | Groups to mute for Service Bus status monitor | map | `` | no | | status_time_aggregator | Monitor aggregator for Service Bus status [available values: min, max or avg] | string | `max` | no | | status_timeframe | Monitor timeframe for
Service Bus status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| user_errors_message | Custom message for Service Bus user errors monitor | string | `` | no | +| user_errors_silenced | Groups to mute for Service Bus user errors monitor | map | `` | no | +| user_errors_threshold_critical | Critical threshold for Service Bus user errors monitor | string | `90` | no | +| user_errors_threshold_warning | Warning threshold for Service Bus user errors monitor | string | `50` | no | +| user_errors_timeframe | Monitor timeframe for Service Bus user errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | ## Outputs diff --git a/cloud/azure/servicebus/inputs.tf b/cloud/azure/servicebus/inputs.tf index 680b606..ca8bbc4 100644 --- a/cloud/azure/servicebus/inputs.tf +++ b/cloud/azure/servicebus/inputs.tf @@ -58,3 +58,83 @@ variable "status_timeframe" { description = "Monitor timeframe for Service Bus status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" default = "last_15m" } + +variable "no_active_connections_silenced" { + description = "Groups to mute for Service Bus no active connections monitor" + type = "map" + default = {} +} + +variable "no_active_connections_message" { + description = "Custom message for Service Bus no active connections monitor" + type = "string" + default = "" +} + +variable "no_active_connections_time_aggregator" { + description = "Monitor aggregator for Service Bus no active connections [available values: min, max or avg]" + type = "string" + default = "max" +} + +variable "no_active_connections_timeframe" { + description = "Monitor timeframe for Service Bus no active connections [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_15m" +} + +variable "server_errors_message" { + description = "Custom message for Service Bus server errors monitor" +
type = "string" + default = "" +} + +variable "server_errors_silenced" { + description = "Groups to mute for Service Bus server errors monitor" + type = "map" + default = {} +} + +variable "server_errors_timeframe" { + description = "Monitor timeframe for Service Bus server errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "server_errors_threshold_critical" { + description = "Critical threshold for Service Bus server errors monitor" + default = 90 +} + +variable "server_errors_threshold_warning" { + description = "Warning threshold for Service Bus server errors monitor" + default = 50 +} + +variable "user_errors_message" { + description = "Custom message for Service Bus user errors monitor" + type = "string" + default = "" +} + +variable "user_errors_silenced" { + description = "Groups to mute for Service Bus user errors monitor" + type = "map" + default = {} +} + +variable "user_errors_timeframe" { + description = "Monitor timeframe for Service Bus user errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "user_errors_threshold_critical" { + description = "Critical threshold for Service Bus user errors monitor" + default = 90 +} + +variable "user_errors_threshold_warning" { + description = "Warning threshold for Service Bus user errors monitor" + default = 50 +} diff --git a/cloud/azure/servicebus/monitors-service-bus.tf b/cloud/azure/servicebus/monitors-service-bus.tf index dc9d1ea..79acaf2 100644 --- a/cloud/azure/servicebus/monitors-service-bus.tf +++ b/cloud/azure/servicebus/monitors-service-bus.tf @@ -24,3 +24,98 @@ EOF tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:servicebus", "team:claranet", "created-by:terraform", "${var.status_extra_tags}"] } + +resource "datadog_monitor" "service_bus_no_active_connections" { + name = 
"[${var.environment}] Service Bus has no active connection" + message = "${coalesce(var.no_active_connections_message, var.message)}" + + query = < ${var.user_errors_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + critical = "${var.user_errors_threshold_critical}" + warning = "${var.user_errors_threshold_warning}" + } + + silenced = "${var.user_errors_silenced}" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + + tags = ["env:${var.environment}", "resource:servicebus", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "service_bus_server_errors" { + name = "[${var.environment}] Service Bus server errors rate is high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.server_errors_message, var.message)}" + + query = < ${var.server_errors_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + critical = "${var.server_errors_threshold_critical}" + warning = "${var.server_errors_threshold_warning}" + } + + silenced = "${var.server_errors_silenced}" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + + tags = ["env:${var.environment}", "resource:servicebus", "team:azure", "provider:azure"] +} From c8f5e0e03e7a60c08a044e036f044b4eea468e4d Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Thu, 5 Jul 2018 17:17:59 +0200 Subject: [PATCH 05/18] MON-237 - Datalake store monitors added --- cloud/azure/README.md | 4 ++ cloud/azure/cosmosdb/README.md | 1 - cloud/azure/datalakestore/README.md | 43 +++++++++++++++++ 
cloud/azure/datalakestore/inputs.tf | 47 +++++++++++++++++++ .../datalakestore/monitors-datalakestore.tf | 34 ++++++++++++++ cloud/azure/inputs.tf | 24 ++++++++++ cloud/azure/monitors.tf | 23 +++++++-- 7 files changed, 171 insertions(+), 5 deletions(-) create mode 100644 cloud/azure/datalakestore/README.md create mode 100644 cloud/azure/datalakestore/inputs.tf create mode 100644 cloud/azure/datalakestore/monitors-datalakestore.tf diff --git a/cloud/azure/README.md b/cloud/azure/README.md index fd33024..bbdafc7 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -98,6 +98,10 @@ Inputs | cosmos_db_ru_utilization_rate_threshold_critical | Critical threshold for Cosmos DB collection RU utilization monitor | string | `90` | no | | cosmos_db_ru_utilization_rate_threshold_warning | Warning threshold for Cosmos DB collection RU utilization monitor | string | `80` | no | | cosmos_db_ru_utilization_silenced | Groups to mute for Cosmos DB collection RU utilization monitor | map | `` | no | +| datalakestore_status_message | Custom message for Datalake Store status monitor | string | `` | no | +| datalakestore_status_silenced | Groups to mute for Datalake Store status monitor | map | `` | no | +| datalakestore_status_time_aggregator | Monitor aggregator for Datalake Store status [available values: min, max or avg] | string | `max` | no | +| datalakestore_status_timeframe | Monitor timeframe for Datalake Store status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | | delay | Delay in seconds for the metric evaluation | string | `900` | no | | environment | Architecture environment | string | - | yes | | eventhub_errors_rate_message | Custom message for Event Hub errors monitor | string | `` | no | diff --git a/cloud/azure/cosmosdb/README.md b/cloud/azure/cosmosdb/README.md index 9c051e9..9912529 100644 --- a/cloud/azure/cosmosdb/README.md +++ b/cloud/azure/cosmosdb/README.md @@ -63,7 +63,6 @@ 
Inputs | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when a monitor is triggered | string | - | yes | -| subscription_id | ID of the subscription | string | - | yes | Related documentation --------------------- diff --git a/cloud/azure/datalakestore/README.md b/cloud/azure/datalakestore/README.md new file mode 100644 index 0000000..369976b --- /dev/null +++ b/cloud/azure/datalakestore/README.md @@ -0,0 +1,43 @@ +Service Bus Datadog monitor +=========================== + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-datalakestore" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/datalakestore?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates a Datadog monitor with the following checks : + +* Service status check + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| delay | Delay in seconds for the metric evaluation | string | `900` | no | +| environment | Architecture environment | string | - | yes | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| status_message | Custom message for Datalake Store status monitor | string | `` | no | +| status_silenced | Groups to mute for Datalake Store status monitor | map | `` | no | +| status_time_aggregator | Monitor aggregator for Datalake Store status [available values: min, max or avg] | string | `max` | no | +| status_timeframe | Monitor timeframe for 
Datalake Store status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | + +Related documentation +--------------------- + +DataDog documentation : [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/) +You must search `datalake`, there is no integration for now. + +Azure metrics documentation : [https://docs.microsoft.com/fr-fr/azure/monitoring-and-diagnostics/monitoring-supported-metrics#microsoftdatalakestoreaccounts](https://docs.microsoft.com/fr-fr/azure/monitoring-and-diagnostics/monitoring-supported-metrics#microsoftdatalakestoreaccounts) diff --git a/cloud/azure/datalakestore/inputs.tf b/cloud/azure/datalakestore/inputs.tf new file mode 100644 index 0000000..30afe45 --- /dev/null +++ b/cloud/azure/datalakestore/inputs.tf @@ -0,0 +1,47 @@ +variable "environment" { + description = "Architecture environment" + type = "string" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "message" { + description = "Message sent when a monitor is triggered" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 900 +} + +# Azure Datalake Store specific variables +variable "status_silenced" { + description = "Groups to mute for Datalake Store status monitor" + type = "map" + default = {} +} + +variable "status_message" { + description = "Custom message for Datalake Store status monitor" + type = "string" + default = "" +} + +variable "status_time_aggregator" { + description = "Monitor aggregator for Datalake Store status [available values: min, max or avg]" + type = "string" + default = "max" +} + +variable "status_timeframe" { + description = "Monitor timeframe for Datalake Store status [available 
values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + default = "last_15m" +} diff --git a/cloud/azure/datalakestore/monitors-datalakestore.tf b/cloud/azure/datalakestore/monitors-datalakestore.tf new file mode 100644 index 0000000..fa4b67d --- /dev/null +++ b/cloud/azure/datalakestore/monitors-datalakestore.tf @@ -0,0 +1,34 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_datalakestore:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "datalakestore_status" { + name = "[${var.environment}] Datalake Store is down" + message = "${coalesce(var.status_message, var.message)}" + + query = < Date: Fri, 6 Jul 2018 14:37:40 +0200 Subject: [PATCH 06/18] MON-237 - KeyVault monitors added --- cloud/azure/README.md | 9 +++ cloud/azure/inputs.tf | 50 +++++++++++++++ cloud/azure/keyvault/README.md | 49 +++++++++++++++ cloud/azure/keyvault/inputs.tf | 74 +++++++++++++++++++++++ cloud/azure/keyvault/monitors-keyvault.tf | 67 ++++++++++++++++++++ cloud/azure/monitors.tf | 22 +++++++ 6 files changed, 271 insertions(+) create mode 100644 cloud/azure/keyvault/README.md create mode 100644 cloud/azure/keyvault/inputs.tf create mode 100644 cloud/azure/keyvault/monitors-keyvault.tf diff --git a/cloud/azure/README.md b/cloud/azure/README.md index bbdafc7..65aa7a4 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -186,6 +186,15 @@ Inputs | iothub_total_devices_silenced | Groups to mute for IoT Hub total devices monitor | map | `` | no | | iothub_total_devices_time_aggregator | Monitor aggregator for IoT Hub total devices [available values: min, max or avg] | string | `min` | no | | iothub_total_devices_timeframe | Monitor timeframe for IoT Hub total devices [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | +|
keyvault_api_result_message | Custom message for Key Vault API result monitor | string | `` | no | +| keyvault_api_result_silenced | Groups to mute for Key Vault API result monitor | map | `` | no | +| keyvault_api_result_threshold_critical | Critical threshold for Key Vault API result rate | string | `10` | no | +| keyvault_api_result_threshold_warning | Warning threshold for Key Vault API result rate | string | `30` | no | +| keyvault_api_result_timeframe | Monitor timeframe for Key Vault API result [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_30m` | no | +| keyvault_status_message | Custom message for Key Vault status monitor | string | `` | no | +| keyvault_status_silenced | Groups to mute for Key Vault status monitor | map | `` | no | +| keyvault_status_time_aggregator | Monitor aggregator for Key Vault status [available values: min, max or avg] | string | `max` | no | +| keyvault_status_timeframe | Monitor timeframe for Key Vault status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | | message | Message sent when a monitor is triggered | string | - | yes | | non_taggable_filter_tags | Tags used for filtering for components without tag support | string | `*` | no | | redis_evictedkeys_limit_message | Custom message for Redis evicted keys monitor | string | `` | no | diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf index fc6a5eb..c0381bb 100644 --- a/cloud/azure/inputs.tf +++ b/cloud/azure/inputs.tf @@ -1705,3 +1705,53 @@ variable "datalakestore_status_timeframe" { description = "Monitor timeframe for Datalake Store status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" default = "last_15m" } + +variable "keyvault_status_silenced" { + description = "Groups to mute for Key Vault status monitor" + type = "map" + default = {} +} + +variable "keyvault_status_message" { + description 
= "Custom message for Key Vault status monitor" + type = "string" + default = "" +} + +variable "keyvault_status_time_aggregator" { + description = "Monitor aggregator for Key Vault status [available values: min, max or avg]" + type = "string" + default = "max" +} + +variable "keyvault_status_timeframe" { + description = "Monitor timeframe for Key Vault status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + default = "last_15m" +} + +variable "keyvault_api_result_silenced" { + description = "Groups to mute for Key Vault API result monitor" + type = "map" + default = {} +} + +variable "keyvault_api_result_message" { + description = "Custom message for Key Vault API result monitor" + type = "string" + default = "" +} + +variable "keyvault_api_result_timeframe" { + description = "Monitor timeframe for Key Vault API result [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + default = "last_30m" +} + +variable "keyvault_api_result_threshold_critical" { + description = "Critical threshold for Key Vault API result rate" + default = 10 +} + +variable "keyvault_api_result_threshold_warning" { + description = "Warning threshold for Key Vault API result rate" + default = 30 +} diff --git a/cloud/azure/keyvault/README.md b/cloud/azure/keyvault/README.md new file mode 100644 index 0000000..20d1a6a --- /dev/null +++ b/cloud/azure/keyvault/README.md @@ -0,0 +1,49 @@ +Key Vault Datadog monitor +========================= + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-keyvault" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/keyvault?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates a Datadog monitor with the following checks : + +* Service status check +* API result rate + +Inputs +------ + +| Name | 
Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| api_result_message | Custom message for Key Vault API result monitor | string | `` | no | +| api_result_silenced | Groups to mute for Key Vault API result monitor | map | `` | no | +| api_result_threshold_critical | Critical threshold for Key Vault API result rate | string | `10` | no | +| api_result_threshold_warning | Warning threshold for Key Vault API result rate | string | `30` | no | +| api_result_timeframe | Monitor timeframe for Key Vault API result [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_30m` | no | +| delay | Delay in seconds for the metric evaluation | string | `900` | no | +| environment | Architecture environment | string | - | yes | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| status_message | Custom message for Key Vault status monitor | string | `` | no | +| status_silenced | Groups to mute for Key Vault status monitor | map | `` | no | +| status_time_aggregator | Monitor aggregator for Key Vault status [available values: min, max or avg] | string | `max` | no | +| status_timeframe | Monitor timeframe for Key Vault status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | + +Related documentation +--------------------- + +DataDog documentation : [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/) +You must search `keyvault`, there is no integration for now. 
+ +Azure metrics documentation : [https://docs.microsoft.com/fr-fr/azure/monitoring-and-diagnostics/monitoring-supported-metrics#microsoftkeyvaultvaults](https://docs.microsoft.com/fr-fr/azure/monitoring-and-diagnostics/monitoring-supported-metrics#microsoftkeyvaultvaults) diff --git a/cloud/azure/keyvault/inputs.tf b/cloud/azure/keyvault/inputs.tf new file mode 100644 index 0000000..6fac667 --- /dev/null +++ b/cloud/azure/keyvault/inputs.tf @@ -0,0 +1,74 @@ +variable "environment" { + description = "Architecture environment" + type = "string" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "message" { + description = "Message sent when a monitor is triggered" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 900 +} + +# Azure Key Vault specific variables +variable "status_silenced" { + description = "Groups to mute for Key Vault status monitor" + type = "map" + default = {} +} + +variable "status_message" { + description = "Custom message for Key Vault status monitor" + type = "string" + default = "" +} + +variable "status_time_aggregator" { + description = "Monitor aggregator for Key Vault status [available values: min, max or avg]" + type = "string" + default = "max" +} + +variable "status_timeframe" { + description = "Monitor timeframe for Key Vault status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + default = "last_15m" +} + +variable "api_result_silenced" { + description = "Groups to mute for Key Vault API result monitor" + type = "map" + default = {} +} + +variable "api_result_message" { + description = "Custom message for Key Vault API result monitor" + type = "string" + default = "" +} + +variable "api_result_timeframe" { + description = 
"Monitor timeframe for Key Vault API result [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + default = "last_30m" +} + +variable "api_result_threshold_critical" { + description = "Critical threshold for Key Vault API result rate" + default = 10 +} + +variable "api_result_threshold_warning" { + description = "Warning threshold for Key Vault API result rate" + default = 30 +} diff --git a/cloud/azure/keyvault/monitors-keyvault.tf b/cloud/azure/keyvault/monitors-keyvault.tf new file mode 100644 index 0000000..f35c67d --- /dev/null +++ b/cloud/azure/keyvault/monitors-keyvault.tf @@ -0,0 +1,67 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_keyvault:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "keyvault_status" { + name = "[${var.environment}] Key Vault is down" + message = "${coalesce(var.status_message, var.message)}" + + query = < Date: Tue, 24 Jul 2018 12:27:12 +0200 Subject: [PATCH 07/18] MON-237 - README and outputs regenerated --- README.md | 3 ++ cloud/azure/cosmosdb/README.md | 51 ++++++++++++---------------- cloud/azure/cosmosdb/outputs.tf | 19 +++++++++++ cloud/azure/datalakestore/README.md | 28 ++++++++------- cloud/azure/datalakestore/outputs.tf | 4 +++ cloud/azure/keyvault/README.md | 31 ++++++++++------- cloud/azure/keyvault/outputs.tf | 9 +++++ cloud/azure/servicebus/README.md | 11 +++--- cloud/azure/servicebus/outputs.tf | 15 ++++++++ 9 files changed, 112 insertions(+), 59 deletions(-) create mode 100644 cloud/azure/cosmosdb/outputs.tf create mode 100644 cloud/azure/datalakestore/outputs.tf create mode 100644 cloud/azure/keyvault/outputs.tf diff --git a/README.md b/README.md index 1cc513d..4963bb6 100644 --- a/README.md +++ b/README.md @@ -84,8 +84,11 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
- [azure](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/azure/) - [apimanagement](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/azure/apimanagement/) - [app-services](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/azure/app-services/) + - [cosmosdb](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/azure/cosmosdb/) + - [datalakestore](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/azure/datalakestore/) - [eventhub](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/azure/eventhub/) - [iothubs](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/azure/iothubs/) + - [keyvault](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/azure/keyvault/) - [redis](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/azure/redis/) - [servicebus](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/azure/servicebus/) - [sql-database](https://bitbucket.org/morea/terraform.feature.datadog/src/master/cloud/azure/sql-database/) diff --git a/cloud/azure/cosmosdb/README.md b/cloud/azure/cosmosdb/README.md index 9912529..8db2c06 100644 --- a/cloud/azure/cosmosdb/README.md +++ b/cloud/azure/cosmosdb/README.md @@ -1,45 +1,27 @@ -Azure Cosmos DB DataDog monitors -================================ +# CLOUD AZURE COSMOSDB DataDog monitors -How to use this module ----------------------- +## How to use this module ``` -module "datadog-monitors-azure-cosmosdb" { +module "datadog-monitors-cloud-azure-cosmosdb" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/cosmosdb?ref={revision}" - message = "${module.datadog-message-alerting.alerting-message}" environment = "${var.environment}" + message = "${module.datadog-message-alerting.alerting-message}" } -``` - -How to define cosmos_db_ru_utilization_collection variable 
----------------------------------------------------------- - -At the time this module is defined, we can't define Cosmos DB collection with Terraform, so we have to define a variable making the connection between collections and RU max -``` -variable cosmos_db_ru_utilization_collection { - type = "map" - default = { - "collection_1" = "ru_max_1" - "collection_2" = "ru_max_2" - ... - } -} ``` -Purpose -------- -Creates a DataDog monitors with the following checks : +## Purpose -* No request -* Too many 4xx requests -* Too many 5xx requests -* Collection RU utilization +Creates DataDog monitors with the following checks: -Inputs ------- +- Cosmos DB 4xx requests rate is high +- Cosmos DB 5xx requests rate is high +- Cosmos DB has no request +- Cosmos DB collection ${element(keys(var.cosmos_db_ru_utilization_collection),count.index)} RU utilization is high + +## Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| @@ -64,6 +46,15 @@ Inputs | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when a monitor is triggered | string | - | yes | +## Outputs + +| Name | Description | +|------|-------------| +| cosmos_db_4xx_requests_id | id for monitor cosmos_db_4xx_requests | +| cosmos_db_5xx_requests_id | id for monitor cosmos_db_5xx_requests | +| cosmos_db_ru_utilization_id | id for monitor cosmos_db_ru_utilization | +| cosmos_db_success_no_data_id | id for monitor cosmos_db_success_no_data | + Related documentation --------------------- diff --git a/cloud/azure/cosmosdb/outputs.tf b/cloud/azure/cosmosdb/outputs.tf new file mode 100644 index 0000000..0dd8c71 --- /dev/null +++ b/cloud/azure/cosmosdb/outputs.tf @@ -0,0 +1,19 @@ +output "cosmos_db_4xx_requests_id" { + description = "id for monitor cosmos_db_4xx_requests" + value = "${datadog_monitor.cosmos_db_4xx_requests.id}" +} + +output "cosmos_db_5xx_requests_id" { + description = "id for monitor 
cosmos_db_5xx_requests" + value = "${datadog_monitor.cosmos_db_5xx_requests.id}" +} + +output "cosmos_db_success_no_data_id" { + description = "id for monitor cosmos_db_success_no_data" + value = "${datadog_monitor.cosmos_db_success_no_data.id}" +} + +output "cosmos_db_ru_utilization_id" { + description = "id for monitor cosmos_db_ru_utilization" + value = "${datadog_monitor.cosmos_db_ru_utilization.id}" +} diff --git a/cloud/azure/datalakestore/README.md b/cloud/azure/datalakestore/README.md index 369976b..a96ffe9 100644 --- a/cloud/azure/datalakestore/README.md +++ b/cloud/azure/datalakestore/README.md @@ -1,26 +1,24 @@ -Service Bus Datadog monitor -=========================== +# CLOUD AZURE DATALAKESTORE DataDog monitors -How to use this module ----------------------- +## How to use this module ``` -module "datadog-monitors-azure-datalakestore" { +module "datadog-monitors-cloud-azure-datalakestore" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/datalakestore?ref={revision}" - message = "${module.datadog-message-alerting.alerting-message}" environment = "${var.environment}" + message = "${module.datadog-message-alerting.alerting-message}" } + ``` -Purpose -------- -Creates a Datadog monitor with the following checks : +## Purpose -* Service status check +Creates DataDog monitors with the following checks: -Inputs ------- +- Datalake Store is down + +## Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| @@ -34,6 +32,12 @@ Inputs | status_time_aggregator | Monitor aggregator for Datalake Store status [available values: min, max or avg] | string | `max` | no | | status_timeframe | Monitor timeframe for Datalake Store status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +## Outputs + +| Name | Description | +|------|-------------| +| datalakestore_status_id | id for monitor datalakestore_status | + 
Related documentation --------------------- diff --git a/cloud/azure/datalakestore/outputs.tf b/cloud/azure/datalakestore/outputs.tf new file mode 100644 index 0000000..1c2fac8 --- /dev/null +++ b/cloud/azure/datalakestore/outputs.tf @@ -0,0 +1,4 @@ +output "datalakestore_status_id" { + description = "id for monitor datalakestore_status" + value = "${datadog_monitor.datalakestore_status.id}" +} diff --git a/cloud/azure/keyvault/README.md b/cloud/azure/keyvault/README.md index 20d1a6a..3943bed 100644 --- a/cloud/azure/keyvault/README.md +++ b/cloud/azure/keyvault/README.md @@ -1,27 +1,25 @@ -Key Vault Datadog monitor -========================= +# CLOUD AZURE KEYVAULT DataDog monitors -How to use this module ----------------------- +## How to use this module ``` -module "datadog-monitors-azure-keyvault" { +module "datadog-monitors-cloud-azure-keyvault" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/keyvault?ref={revision}" - message = "${module.datadog-message-alerting.alerting-message}" environment = "${var.environment}" + message = "${module.datadog-message-alerting.alerting-message}" } + ``` -Purpose -------- -Creates a Datadog monitor with the following checks : +## Purpose -* Service status check -* API result rate +Creates DataDog monitors with the following checks: -Inputs ------- +- Key Vault is down +- Key Vault API result rate is low + +## Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| @@ -40,6 +38,13 @@ Inputs | status_time_aggregator | Monitor aggregator for Key Vault status [available values: min, max or avg] | string | `max` | no | | status_timeframe | Monitor timeframe for Key Vault status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +## Outputs + +| Name | Description | +|------|-------------| +| keyvault_api_result_id | id for monitor keyvault_api_result | +| keyvault_status_id | 
id for monitor keyvault_status | + Related documentation --------------------- diff --git a/cloud/azure/keyvault/outputs.tf b/cloud/azure/keyvault/outputs.tf new file mode 100644 index 0000000..0100295 --- /dev/null +++ b/cloud/azure/keyvault/outputs.tf @@ -0,0 +1,9 @@ +output "keyvault_status_id" { + description = "id for monitor keyvault_status" + value = "${datadog_monitor.keyvault_status.id}" +} + +output "keyvault_api_result_id" { + description = "id for monitor keyvault_api_result" + value = "${datadog_monitor.keyvault_api_result.id}" +} diff --git a/cloud/azure/servicebus/README.md b/cloud/azure/servicebus/README.md index 8c3fa7e..cca3b22 100644 --- a/cloud/azure/servicebus/README.md +++ b/cloud/azure/servicebus/README.md @@ -16,10 +16,10 @@ module "datadog-monitors-cloud-azure-servicebus" { Creates DataDog monitors with the following checks: -- Service status check -- No active connection -- Server errors rate -- User errors rate +- Service Bus is down +- Service Bus has no active connection +- Service Bus user errors rate is high +- Service Bus server errors rate is high ## Inputs @@ -55,6 +55,9 @@ Creates DataDog monitors with the following checks: | Name | Description | |------|-------------| +| service_bus_no_active_connections_id | id for monitor service_bus_no_active_connections | +| service_bus_server_errors_id | id for monitor service_bus_server_errors | +| service_bus_user_errors_id | id for monitor service_bus_user_errors | | servicebus_status_id | id for monitor servicebus_status | ## Related documentation diff --git a/cloud/azure/servicebus/outputs.tf b/cloud/azure/servicebus/outputs.tf index 843de78..cef391b 100644 --- a/cloud/azure/servicebus/outputs.tf +++ b/cloud/azure/servicebus/outputs.tf @@ -2,3 +2,18 @@ output "servicebus_status_id" { description = "id for monitor servicebus_status" value = "${datadog_monitor.servicebus_status.*.id}" } + +output "service_bus_no_active_connections_id" { + description = "id for monitor 
service_bus_no_active_connections" + value = "${datadog_monitor.service_bus_no_active_connections.id}" +} + +output "service_bus_user_errors_id" { + description = "id for monitor service_bus_user_errors" + value = "${datadog_monitor.service_bus_user_errors.id}" +} + +output "service_bus_server_errors_id" { + description = "id for monitor service_bus_server_errors" + value = "${datadog_monitor.service_bus_server_errors.id}" +} From 56d40bcb430c821fa04640982eb0b8c6c9b5757b Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Fri, 24 Aug 2018 17:35:24 +0200 Subject: [PATCH 08/18] MON-237 Monitors improvements and best practices fixes --- cloud/azure/cosmosdb/README.md | 26 ++- cloud/azure/cosmosdb/inputs.tf | 111 ++++++++++- cloud/azure/cosmosdb/modules.tf | 30 +++ cloud/azure/cosmosdb/monitors-cosmosdb.tf | 126 +++++++++---- cloud/azure/cosmosdb/outputs-custom.tf | 4 + cloud/azure/cosmosdb/outputs.tf | 10 +- cloud/azure/datalakestore/README.md | 4 +- cloud/azure/datalakestore/inputs.tf | 15 +- cloud/azure/datalakestore/modules.tf | 8 + .../datalakestore/monitors-datalakestore.tf | 18 +- cloud/azure/inputs.tf | 177 +++++++++++++++++- cloud/azure/keyvault/README.md | 8 +- cloud/azure/keyvault/inputs.tf | 68 ++++++- cloud/azure/keyvault/modules.tf | 19 ++ cloud/azure/keyvault/monitors-keyvault.tf | 66 ++++--- cloud/azure/monitors.tf | 57 +++++- cloud/azure/servicebus/README.md | 6 +- cloud/azure/servicebus/inputs.tf | 4 +- .../azure/servicebus/monitors-service-bus.tf | 22 +-- 19 files changed, 656 insertions(+), 123 deletions(-) create mode 100644 cloud/azure/cosmosdb/modules.tf create mode 100644 cloud/azure/cosmosdb/outputs-custom.tf create mode 100644 cloud/azure/datalakestore/modules.tf create mode 100644 cloud/azure/keyvault/modules.tf diff --git a/cloud/azure/cosmosdb/README.md b/cloud/azure/cosmosdb/README.md index 8db2c06..0b908dc 100644 --- a/cloud/azure/cosmosdb/README.md +++ b/cloud/azure/cosmosdb/README.md @@ -18,33 +18,52 @@ Creates DataDog monitors with 
the following checks: - Cosmos DB 4xx requests rate is high - Cosmos DB 5xx requests rate is high +- Cosmos DB collection ${element(keys(var.cosmos_db_ru_utilization_collections),count.index)} RU utilization is high - Cosmos DB has no request -- Cosmos DB collection ${element(keys(var.cosmos_db_ru_utilization_collection),count.index)} RU utilization is high +- Cosmos DB is down ## Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| +| cosmos_db_4xx_request_extra_tags | Extra tags for Cosmos DB 4xx requests monitor | list | `` | no | | cosmos_db_4xx_request_rate_threshold_critical | Critical threshold for Cosmos DB 4xx requests monitor | string | `80` | no | | cosmos_db_4xx_request_rate_threshold_warning | Warning threshold for Cosmos DB 4xx requests monitor | string | `50` | no | +| cosmos_db_4xx_request_time_aggregator | Monitor aggregator for Cosmos DB 4xx requests [available values: min, max or avg] | string | `sum` | no | +| cosmos_db_4xx_request_timeframe | Monitor timeframe for Cosmos DB 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | cosmos_db_4xx_requests_message | Custom message for Cosmos DB 4xx requests monitor | string | `` | no | | cosmos_db_4xx_requests_silenced | Groups to mute for Cosmos DB 4xx requests monitor | map | `` | no | +| cosmos_db_5xx_request_rate_extra_tags | Extra tags for Cosmos DB 5xx requests monitor | list | `` | no | | cosmos_db_5xx_request_rate_threshold_critical | Critical threshold for Cosmos DB 5xx requests monitor | string | `80` | no | | cosmos_db_5xx_request_rate_threshold_warning | Warning threshold for Cosmos DB 5xx requests monitor | string | `50` | no | +| cosmos_db_5xx_request_time_aggregator | Monitor aggregator for Cosmos DB 5xx requests [available values: min, max or avg] | string | `sum` | no | +| cosmos_db_5xx_request_timeframe | Monitor timeframe for Cosmos DB 5xx requests [available values: `last_#m`
(1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | cosmos_db_5xx_requests_message | Custom message for Cosmos DB 5xx requests monitor | string | `` | no | | cosmos_db_5xx_requests_silenced | Groups to mute for Cosmos DB 5xx requests monitor | map | `` | no | +| cosmos_db_no_request_extra_tags | Extra tags for Cosmos DB no request monitor | list | `` | no | | cosmos_db_no_request_message | Custom message for Cosmos DB no request monitor | string | `` | no | | cosmos_db_no_request_silenced | Groups to mute for Cosmos DB no request monitor | map | `` | no | -| cosmos_db_ru_utilization_collection | Group to associate Cosmos DB collection to RU max | map | - | yes | +| cosmos_db_no_request_time_aggregator | Monitor aggregator for Cosmos DB no request [available values: min, max or avg] | string | `max` | no | +| cosmos_db_no_request_timeframe | Monitor timeframe for Cosmos DB no request [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | +| cosmos_db_ru_utilization_collections | Group to associate Cosmos DB collection to RU max | map | - | yes | +| cosmos_db_ru_utilization_extra_tags | Extra tags for Cosmos DB collection RU utilization monitor | list | `` | no | | cosmos_db_ru_utilization_message | Custom message for Cosmos DB collection RU utilization monitor | string | `` | no | | cosmos_db_ru_utilization_rate_threshold_critical | Critical threshold for Cosmos DB collection RU utilization monitor | string | `90` | no | | cosmos_db_ru_utilization_rate_threshold_warning | Warning threshold for Cosmos DB collection RU utilization monitor | string | `80` | no | | cosmos_db_ru_utilization_silenced | Groups to mute for Cosmos DB collection RU utilization monitor | map | `` | no | -| delay | Delay in seconds for the metric evaluation | string | `900` | no | +| cosmos_db_ru_utilization_time_aggregator | Monitor aggregator for Cosmos DB RU utilization [available values: min, max or avg]
| string | `min` | no | +| cosmos_db_ru_utilization_timeframe | Monitor timeframe for Cosmos DB RU utilization [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | environment | Architecture environment | string | - | yes | +| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when a monitor is triggered | string | - | yes | +| new_host_delay | Delay in seconds before monitor new resource | string | `300` | no | +| status_extra_tags | Extra tags for Cosmos DB status monitor | list | `` | no | +| status_message | Custom message for Cosmos DB status monitor | string | `` | no | +| status_silenced | Groups to mute for Cosmos DB status monitor | map | `` | no | +| status_time_aggregator | Monitor aggregator for Cosmos DB status [available values: min, max or avg] | string | `max` | no | +| status_timeframe | Monitor timeframe for Cosmos DB status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | ## Outputs @@ -53,6 +72,7 @@ Creates DataDog monitors with the following checks: | cosmos_db_4xx_requests_id | id for monitor cosmos_db_4xx_requests | | cosmos_db_5xx_requests_id | id for monitor cosmos_db_5xx_requests | | cosmos_db_ru_utilization_id | id for monitor cosmos_db_ru_utilization | +| cosmos_db_status_id | id for monitor cosmos_db_status | | cosmos_db_success_no_data_id | id for monitor cosmos_db_success_no_data | Related documentation diff --git a/cloud/azure/cosmosdb/inputs.tf b/cloud/azure/cosmosdb/inputs.tf index e8d04bf..19f0939 100644 --- a/cloud/azure/cosmosdb/inputs.tf +++ b/cloud/azure/cosmosdb/inputs.tf @@ -17,12 +17,47 @@ variable "message" { description = "Message sent
when a monitor is triggered" } -variable "delay" { +variable "evaluation_delay" { description = "Delay in seconds for the metric evaluation" default = 900 } +variable "new_host_delay" { + description = "Delay in seconds before monitor new resource" + default = 300 +} + # Azure CosmosDB specific variables +variable "status_silenced" { + description = "Groups to mute for Cosmos DB status monitor" + type = "map" + default = {} +} + +variable "status_extra_tags" { + description = "Extra tags for Cosmos DB status monitor" + type = "list" + default = [] +} + +variable "status_message" { + description = "Custom message for Cosmos DB status monitor" + type = "string" + default = "" +} + +variable "status_time_aggregator" { + description = "Monitor aggregator for Cosmos DB status [available values: min, max or avg]" + type = "string" + default = "max" +} + +variable "status_timeframe" { + description = "Monitor timeframe for Cosmos DB status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + variable "cosmos_db_4xx_requests_message" { description = "Custom message for Cosmos DB 4xx requests monitor" type = "string" @@ -45,6 +80,24 @@ variable "cosmos_db_4xx_request_rate_threshold_warning" { default = 50 } +variable "cosmos_db_4xx_request_extra_tags" { + description = "Extra tags for Cosmos DB 4xx requests monitor" + type = "list" + default = [] +} + +variable "cosmos_db_4xx_request_time_aggregator" { + description = "Monitor aggregator for Cosmos DB 4xx requests [available values: min, max or avg]" + type = "string" + default = "sum" +} + +variable "cosmos_db_4xx_request_timeframe" { + description = "Monitor timeframe for Cosmos DB 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + variable "cosmos_db_5xx_requests_message" { description = "Custom message for Cosmos DB 5xx requests monitor" type 
= "string" @@ -67,6 +120,24 @@ variable "cosmos_db_5xx_request_rate_threshold_warning" { default = 50 } +variable "cosmos_db_5xx_request_rate_extra_tags" { + description = "Extra tags for Cosmos DB 5xx requests monitor" + type = "list" + default = [] +} + +variable "cosmos_db_5xx_request_time_aggregator" { + description = "Monitor aggregator for Cosmos DB 5xx requests [available values: min, max or avg]" + type = "string" + default = "sum" +} + +variable "cosmos_db_5xx_request_timeframe" { + description = "Monitor timeframe for Cosmos DB 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + variable "cosmos_db_no_request_message" { description = "Custom message for Cosmos DB no request monitor" type = "string" @@ -79,6 +150,24 @@ variable "cosmos_db_no_request_silenced" { default = {} } +variable "cosmos_db_no_request_extra_tags" { + description = "Extra tags for Cosmos DB no request monitor" + type = "list" + default = [] +} + +variable "cosmos_db_no_request_time_aggregator" { + description = "Monitor aggregator for Cosmos DB no request [available values: min, max or avg]" + type = "string" + default = "max" +} + +variable "cosmos_db_no_request_timeframe" { + description = "Monitor timeframe for Cosmos DB no request [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + variable "cosmos_db_ru_utilization_message" { description = "Custom message for Cosmos DB collection RU utilization monitor" type = "string" @@ -101,7 +190,25 @@ variable "cosmos_db_ru_utilization_rate_threshold_warning" { default = 80 } -variable "cosmos_db_ru_utilization_collection" { +variable "cosmos_db_ru_utilization_extra_tags" { + description = "Extra tags for Cosmos DB collection RU utilization monitor" + type = "list" + default = [] +} + +variable "cosmos_db_ru_utilization_time_aggregator" { + description = "Monitor 
aggregator for Cosmos DB RU utilization [available values: min, max or avg]" + type = "string" + default = "min" +} + +variable "cosmos_db_ru_utilization_timeframe" { + description = "Monitor timeframe for Cosmos DB RU utilization [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "cosmos_db_ru_utilization_collections" { description = "Group to associate Cosmos DB collection to RU max" type = "map" } diff --git a/cloud/azure/cosmosdb/modules.tf b/cloud/azure/cosmosdb/modules.tf new file mode 100644 index 0000000..aa2ac12 --- /dev/null +++ b/cloud/azure/cosmosdb/modules.tf @@ -0,0 +1,30 @@ +module "filter-tags" { + source = "../../../common/filter-tags" + + environment = "${var.environment}" + resource = "cosmosdb" + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" +} + +module "filter-tags-statuscode" { + source = "../../../common/filter-tags" + + environment = "${var.environment}" + resource = "cosmosdb" + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom},statuscode:%s" + + extra_tags = ["statuscode:%s"] +} + +module "filter-tags-collection" { + source = "../../../common/filter-tags" + + environment = "${var.environment}" + resource = "cosmosdb" + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom},collectionname:%s" + + extra_tags = ["collectionname:%s"] +} diff --git a/cloud/azure/cosmosdb/monitors-cosmosdb.tf b/cloud/azure/cosmosdb/monitors-cosmosdb.tf index fb4b916..4b1e23b 100644 --- a/cloud/azure/cosmosdb/monitors-cosmosdb.tf +++ b/cloud/azure/cosmosdb/monitors-cosmosdb.tf @@ -1,9 +1,33 @@ -data "template_file" "filter" { - template = "$${filter}" +resource "datadog_monitor" "cosmos_db_status" { + name = "[${var.environment}] Cosmos DB is down" + message = 
"${coalesce(var.status_message, var.message)}" - vars { - filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_cosmosdb:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + query = < ${var.cosmos_db_4xx_request_rate_threshold_critical} EOF @@ -37,16 +75,16 @@ resource "datadog_monitor" "cosmos_db_4xx_requests" { silenced = "${var.cosmos_db_4xx_requests_silenced}" notify_no_data = false - evaluation_delay = "${var.delay}" + evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true locked = false require_full_window = true - new_host_delay = "${var.delay}" + new_host_delay = "${var.new_host_delay}" - tags = ["env:${var.environment}", "resource:cosmos_db", "team:azure", "provider:azure"] + tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:cosmos_db", "team:claranet", "created-by:terraform", "${var.cosmos_db_4xx_request_extra_tags}"] } resource "datadog_monitor" "cosmos_db_5xx_requests" { @@ -54,10 +92,16 @@ resource "datadog_monitor" "cosmos_db_5xx_requests" { message = "${coalesce(var.cosmos_db_5xx_requests_message, var.message)}" query = < ${var.cosmos_db_5xx_request_rate_threshold_critical} EOF @@ -72,16 +116,16 @@ resource "datadog_monitor" "cosmos_db_5xx_requests" { silenced = "${var.cosmos_db_5xx_requests_silenced}" notify_no_data = false - evaluation_delay = "${var.delay}" + evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true locked = false require_full_window = true - new_host_delay = "${var.delay}" + new_host_delay = "${var.new_host_delay}" - tags = ["env:${var.environment}", "resource:cosmos_db", "team:azure", "provider:azure"] + tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:cosmos_db", "team:claranet", "created-by:terraform", "${var.cosmos_db_5xx_request_rate_extra_tags}"] } resource "datadog_monitor" 
"cosmos_db_success_no_data" { @@ -89,8 +133,9 @@ resource "datadog_monitor" "cosmos_db_success_no_data" { message = "${coalesce(var.cosmos_db_no_request_message, var.message)}" query = < ${var.cosmos_db_ru_utilization_rate_threshold_critical} EOF @@ -134,14 +182,14 @@ resource "datadog_monitor" "cosmos_db_ru_utilization" { silenced = "${var.cosmos_db_ru_utilization_silenced}" notify_no_data = false - evaluation_delay = "${var.delay}" + evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true locked = false require_full_window = true - new_host_delay = "${var.delay}" + new_host_delay = "${var.new_host_delay}" - tags = ["env:${var.environment}", "resource:cosmos_db", "collection:${element(keys(var.cosmos_db_ru_utilization_collection),count.index)}", "team:azure", "provider:azure"] + tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:cosmos_db", "team:claranet", "created-by:terraform", "${var.cosmos_db_ru_utilization_extra_tags}"] } diff --git a/cloud/azure/cosmosdb/outputs-custom.tf b/cloud/azure/cosmosdb/outputs-custom.tf new file mode 100644 index 0000000..1da4343 --- /dev/null +++ b/cloud/azure/cosmosdb/outputs-custom.tf @@ -0,0 +1,4 @@ +output "cosmos_db_ru_utilization_id" { + description = "id for monitor cosmos_db_ru_utilization" + value = "${datadog_monitor.cosmos_db_ru_utilization.*.id}" +} diff --git a/cloud/azure/cosmosdb/outputs.tf b/cloud/azure/cosmosdb/outputs.tf index 0dd8c71..1039a55 100644 --- a/cloud/azure/cosmosdb/outputs.tf +++ b/cloud/azure/cosmosdb/outputs.tf @@ -1,3 +1,8 @@ +output "cosmos_db_status_id" { + description = "id for monitor cosmos_db_status" + value = "${datadog_monitor.cosmos_db_status.id}" +} + output "cosmos_db_4xx_requests_id" { description = "id for monitor cosmos_db_4xx_requests" value = "${datadog_monitor.cosmos_db_4xx_requests.id}" @@ -12,8 +17,3 @@ output "cosmos_db_success_no_data_id" { description = "id for monitor 
cosmos_db_success_no_data" value = "${datadog_monitor.cosmos_db_success_no_data.id}" } - -output "cosmos_db_ru_utilization_id" { - description = "id for monitor cosmos_db_ru_utilization" - value = "${datadog_monitor.cosmos_db_ru_utilization.id}" -} diff --git a/cloud/azure/datalakestore/README.md b/cloud/azure/datalakestore/README.md index a96ffe9..9603957 100644 --- a/cloud/azure/datalakestore/README.md +++ b/cloud/azure/datalakestore/README.md @@ -22,11 +22,13 @@ Creates DataDog monitors with the following checks: | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| delay | Delay in seconds for the metric evaluation | string | `900` | no | | environment | Architecture environment | string | - | yes | +| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when a monitor is triggered | string | - | yes | +| new_host_delay | Delay in seconds before monitor new resource | string | `300` | no | +| status_extra_tags | Extra tags for Datalake Store status monitor | list | `` | no | | status_message | Custom message for Datalake Store status monitor | string | `` | no | | status_silenced | Groups to mute for Datalake Store status monitor | map | `` | no | | status_time_aggregator | Monitor aggregator for Datalake Store status [available values: min, max or avg] | string | `max` | no | diff --git a/cloud/azure/datalakestore/inputs.tf b/cloud/azure/datalakestore/inputs.tf index 30afe45..319056a 100644 --- a/cloud/azure/datalakestore/inputs.tf +++ b/cloud/azure/datalakestore/inputs.tf @@ -17,11 +17,16 @@ variable "message" { description = "Message sent when a monitor is triggered" }
-variable "delay" { +variable "evaluation_delay" { description = "Delay in seconds for the metric evaluation" default = 900 } +variable "new_host_delay" { + description = "Delay in seconds before monitor new resource" + default = 300 +} + # Azure Datalake Store specific variables variable "status_silenced" { description = "Groups to mute for Datalake Store status monitor" @@ -43,5 +48,11 @@ variable "status_time_aggregator" { variable "status_timeframe" { description = "Monitor timeframe for Datalake Store status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" - default = "last_15m" + default = "last_5m" +} + +variable "status_extra_tags" { + description = "Extra tags for Datalake Store status monitor" + type = "list" + default = [] } diff --git a/cloud/azure/datalakestore/modules.tf b/cloud/azure/datalakestore/modules.tf new file mode 100644 index 0000000..2a526b2 --- /dev/null +++ b/cloud/azure/datalakestore/modules.tf @@ -0,0 +1,8 @@ +module "filter-tags" { + source = "../../../common/filter-tags" + + environment = "${var.environment}" + resource = "datalakestore" + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" +} diff --git a/cloud/azure/datalakestore/monitors-datalakestore.tf b/cloud/azure/datalakestore/monitors-datalakestore.tf index fa4b67d..b7c41b1 100644 --- a/cloud/azure/datalakestore/monitors-datalakestore.tf +++ b/cloud/azure/datalakestore/monitors-datalakestore.tf @@ -1,19 +1,11 @@ -data "template_file" "filter" { - template = "$${filter}" - - vars { - filter = "${var.filter_tags_use_defaults == "true" ?
format("dd_monitoring:enabled,dd_azure_servicebus:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" - } -} - resource "datadog_monitor" "datalakestore_status" { name = "[${var.environment}] Datalake Store is down" message = "${coalesce(var.status_message, var.message)}" query = <` | no | | api_result_message | Custom message for Key Vault API result monitor | string | `` | no | | api_result_silenced | Groups to mute for Key Vault API result monitor | map | `` | no | | api_result_threshold_critical | Critical threshold for Key Vault API result rate | string | `10` | no | | api_result_threshold_warning | Warning threshold for Key Vault API result rate | string | `30` | no | +| api_result_time_aggregator | Monitor aggregator for Key Vault API result [available values: min, max or avg] | string | `sum` | no | | api_result_timeframe | Monitor timeframe for Key Vault API result [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_30m` | no | -| delay | Delay in seconds for the metric evaluation | string | `900` | no | | environment | Architecture environment | string | - | yes | +| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when a monitor is triggered | string | - | yes | +| new_host_delay | Delay in seconds before monitor new resource | string | `300` | no | +| status_extra_tags | Extra tags for Key Vault status monitor | list | `` | no | | status_message | Custom message for Key Vault status monitor | string | `` | no | | status_silenced | Groups to mute for Key Vault status monitor | map | `` | no | | status_time_aggregator | Monitor aggregator for Key Vault status [available values: min, max or avg] | string | `max` | no | diff 
--git a/cloud/azure/keyvault/inputs.tf b/cloud/azure/keyvault/inputs.tf index 6fac667..1e31cec 100644 --- a/cloud/azure/keyvault/inputs.tf +++ b/cloud/azure/keyvault/inputs.tf @@ -17,11 +17,16 @@ variable "message" { description = "Message sent when a monitor is triggered" } -variable "delay" { +variable "evaluation_delay" { description = "Delay in seconds for the metric evaluation" default = 900 } +variable "new_host_delay" { + description = "Delay in seconds before monitor new resource" + default = 300 +} + # Azure Key Vault specific variables variable "status_silenced" { description = "Groups to mute for Key Vault status monitor" @@ -43,7 +48,13 @@ variable "status_time_aggregator" { variable "status_timeframe" { description = "Monitor timeframe for Key Vault status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" - default = "last_15m" + default = "last_5m" +} + +variable "status_extra_tags" { + description = "Extra tags for Key Vault status monitor" + type = "list" + default = [] } variable "api_result_silenced" { @@ -58,9 +69,15 @@ variable "api_result_message" { default = "" } +variable "api_result_time_aggregator" { + description = "Monitor aggregator for Key Vault API result [available values: min, max or avg]" + type = "string" + default = "sum" +} + variable "api_result_timeframe" { description = "Monitor timeframe for Key Vault API result [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" - default = "last_30m" + default = "last_5m" } variable "api_result_threshold_critical" { @@ -72,3 +89,48 @@ variable "api_result_threshold_warning" { description = "Warning threshold for Key Vault API result rate" default = 30 } + +variable "api_result_extra_tags" { + description = "Extra tags for Key Vault API result monitor" + type = "list" + default = [] +} + +variable "api_latency_silenced" { + description = "Groups to mute for Key Vault API latency monitor" + type = "map" + default 
= {} +} + +variable "api_latency_message" { + description = "Custom message for Key Vault API latency monitor" + type = "string" + default = "" +} + +variable "api_latency_time_aggregator" { + description = "Monitor aggregator for Key Vault API latency [available values: min, max or avg]" + type = "string" + default = "min" +} + +variable "api_latency_timeframe" { + description = "Monitor timeframe for Key Vault API latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + default = "last_5m" +} + +variable "api_latency_threshold_critical" { + description = "Critical threshold for Key Vault API latency rate" + default = 100 +} + +variable "api_latency_threshold_warning" { + description = "Warning threshold for Key Vault API latency rate" + default = 80 +} + +variable "api_latency_extra_tags" { + description = "Extra tags for Key Vault API latency monitor" + type = "list" + default = [] +} diff --git a/cloud/azure/keyvault/modules.tf b/cloud/azure/keyvault/modules.tf new file mode 100644 index 0000000..0c21a6a --- /dev/null +++ b/cloud/azure/keyvault/modules.tf @@ -0,0 +1,19 @@ +module "filter-tags" { + source = "../../../common/filter-tags" + + environment = "${var.environment}" + resource = "keyvault" + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" +} + +module "filter-tags-statuscode" { + source = "../../../common/filter-tags" + + environment = "${var.environment}" + resource = "keyvault" + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom},statuscode:%s" + + extra_tags = ["statuscode:%s"] +} diff --git a/cloud/azure/keyvault/monitors-keyvault.tf b/cloud/azure/keyvault/monitors-keyvault.tf index f35c67d..9f5703d 100644 --- a/cloud/azure/keyvault/monitors-keyvault.tf +++ b/cloud/azure/keyvault/monitors-keyvault.tf @@ -1,36 +1,28 @@ -data "template_file" "filter" { - template = "$${filter}" - - vars
{ - filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_servicebus:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" - } -} - resource "datadog_monitor" "keyvault_status" { name = "[${var.environment}] Key Vault is down" message = "${coalesce(var.status_message, var.message)}" query = < ${var.api_latency_threshold_critical} + EOF + + thresholds { + critical = "${var.api_latency_threshold_critical}" + warning = "${var.api_latency_threshold_warning}" + } + + type = "metric alert" + + silenced = "${var.api_latency_silenced}" + + notify_no_data = true + evaluation_delay = "${var.evaluation_delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = false + new_host_delay = "${var.new_host_delay}" + + tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:keyvault", "team:claranet", "created-by:terraform", "${var.api_latency_extra_tags}"] } diff --git a/cloud/azure/monitors.tf b/cloud/azure/monitors.tf index 025f649..a495dc8 100644 --- a/cloud/azure/monitors.tf +++ b/cloud/azure/monitors.tf @@ -390,36 +390,59 @@ module "streamanalytics" { module "cosmosdb" { source = "./cosmosdb" - environment = "${var.environment}" - message = "${var.message}" - delay = "${var.delay}" + environment = "${var.environment}" + message = "${var.message}" + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.new_host_delay}" filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_custom = "${var.filter_tags_custom}" + status_message = "${var.cosmos_db_status_message}" + status_silenced = "${var.cosmos_db_status_silenced}" + status_extra_tags = "${var.cosmos_db_status_extra_tags}" + status_time_aggregator = "${var.cosmos_db_status_time_aggregator}" + status_timeframe = "${var.cosmos_db_status_timeframe}" + cosmos_db_4xx_request_rate_threshold_critical = "${var.cosmos_db_4xx_request_rate_threshold_critical}" 
cosmos_db_4xx_request_rate_threshold_warning = "${var.cosmos_db_4xx_request_rate_threshold_warning}" cosmos_db_4xx_requests_message = "${var.cosmos_db_4xx_requests_message}" cosmos_db_4xx_requests_silenced = "${var.cosmos_db_4xx_requests_silenced}" + cosmos_db_4xx_request_extra_tags = "${var.cosmos_db_4xx_request_extra_tags}" + cosmos_db_4xx_request_time_aggregator = "${var.cosmos_db_4xx_request_time_aggregator}" + cosmos_db_4xx_request_timeframe = "${var.cosmos_db_4xx_request_timeframe}" cosmos_db_5xx_request_rate_threshold_critical = "${var.cosmos_db_5xx_request_rate_threshold_critical}" cosmos_db_5xx_request_rate_threshold_warning = "${var.cosmos_db_5xx_request_rate_threshold_warning}" cosmos_db_5xx_requests_message = "${var.cosmos_db_5xx_requests_message}" cosmos_db_5xx_requests_silenced = "${var.cosmos_db_5xx_requests_silenced}" + cosmos_db_5xx_request_rate_extra_tags = "${var.cosmos_db_5xx_request_rate_extra_tags}" + cosmos_db_5xx_request_time_aggregator = "${var.cosmos_db_5xx_request_time_aggregator}" + cosmos_db_5xx_request_timeframe = "${var.cosmos_db_5xx_request_timeframe}" + + cosmos_db_no_request_message = "${var.cosmos_db_no_request_message}" + cosmos_db_no_request_silenced = "${var.cosmos_db_no_request_silenced}" + cosmos_db_no_request_extra_tags = "${var.cosmos_db_no_request_extra_tags}" + cosmos_db_no_request_time_aggregator = "${var.cosmos_db_no_request_time_aggregator}" + cosmos_db_no_request_timeframe = "${var.cosmos_db_no_request_timeframe}" cosmos_db_ru_utilization_rate_threshold_critical = "${var.cosmos_db_ru_utilization_rate_threshold_critical}" cosmos_db_ru_utilization_rate_threshold_warning = "${var.cosmos_db_ru_utilization_rate_threshold_warning}" cosmos_db_ru_utilization_message = "${var.cosmos_db_ru_utilization_message}" cosmos_db_ru_utilization_silenced = "${var.cosmos_db_ru_utilization_silenced}" - cosmos_db_ru_utilization_collection = "${var.cosmos_db_ru_utilization_collection}" + cosmos_db_ru_utilization_extra_tags = 
"${var.cosmos_db_ru_utilization_extra_tags}" + cosmos_db_ru_utilization_time_aggregator = "${var.cosmos_db_ru_utilization_time_aggregator}" + cosmos_db_ru_utilization_timeframe = "${var.cosmos_db_ru_utilization_timeframe}" + cosmos_db_ru_utilization_collections = "${var.cosmos_db_ru_utilization_collections}" } module "datalakestore" { source = "./datalakestore" - environment = "${var.environment}" - message = "${var.message}" - delay = "${var.delay}" + environment = "${var.environment}" + message = "${var.message}" + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.new_host_delay}" filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_custom = "${var.filter_tags_custom}" @@ -428,14 +451,16 @@ module "datalakestore" { status_message = "${var.datalakestore_status_message}" status_timeframe = "${var.datalakestore_status_timeframe}" status_time_aggregator = "${var.datalakestore_status_time_aggregator}" + status_extra_tags = "${var.datalakestore_status_extra_tags}" } module "keyvault" { source = "./keyvault" - environment = "${var.environment}" - message = "${var.message}" - delay = "${var.delay}" + environment = "${var.environment}" + message = "${var.message}" + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.new_host_delay}" filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_custom = "${var.filter_tags_custom}" @@ -444,10 +469,22 @@ module "keyvault" { status_message = "${var.keyvault_status_message}" status_timeframe = "${var.keyvault_status_timeframe}" status_time_aggregator = "${var.keyvault_status_time_aggregator}" + status_extra_tags = "${var.keyvault_status_extra_tags}" + api_result_enabled = "${var.keyvault_api_result_enabled}" api_result_silenced = "${var.keyvault_api_result_silenced}" api_result_message = "${var.keyvault_api_result_message}" api_result_timeframe = "${var.keyvault_api_result_timeframe}" + api_result_time_aggregator = 
"${var.keyvault_api_result_time_aggregator}" api_result_threshold_critical = "${var.keyvault_api_result_threshold_critical}" api_result_threshold_warning = "${var.keyvault_api_result_threshold_warning}" + api_result_extra_tags = "${var.keyvault_api_result_extra_tags}" + + api_latency_silenced = "${var.keyvault_api_latency_silenced}" + api_latency_message = "${var.keyvault_api_latency_message}" + api_latency_timeframe = "${var.keyvault_api_latency_timeframe}" + api_latency_time_aggregator = "${var.keyvault_api_latency_time_aggregator}" + api_latency_threshold_critical = "${var.keyvault_api_latency_threshold_critical}" + api_latency_threshold_warning = "${var.keyvault_api_latency_threshold_warning}" + api_latency_extra_tags = "${var.keyvault_api_latency_extra_tags}" } diff --git a/cloud/azure/servicebus/README.md b/cloud/azure/servicebus/README.md index cca3b22..cd8682b 100644 --- a/cloud/azure/servicebus/README.md +++ b/cloud/azure/servicebus/README.md @@ -16,10 +16,10 @@ module "datadog-monitors-cloud-azure-servicebus" { Creates DataDog monitors with the following checks: -- Service Bus is down - Service Bus has no active connection -- Service Bus user errors rate is high +- Service Bus is down - Service Bus server errors rate is high +- Service Bus user errors rate is high ## Inputs @@ -30,6 +30,7 @@ Creates DataDog monitors with the following checks: | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when an alert is triggered | string | - | yes | +| new_host_delay | Delay in seconds before monitor new resource | string | `300` | no | | no_active_connections_message | Custom message for Service Bus status monitor | string | `` | no | | no_active_connections_silenced | Groups to mute for Service Bus status monitor | map | `` | no | | no_active_connections_time_aggregator | Monitor 
aggregator for Service Bus status [available values: min, max or avg] | string | `max` | no | @@ -39,7 +40,6 @@ Creates DataDog monitors with the following checks: | server_errors_threshold_critical | Critical threshold for Service Bus server errors monitor | string | `90` | no | | server_errors_threshold_warning | Warning threshold for Service Bus server errors monitor | string | `50` | no | | server_errors_timeframe | Monitor timeframe for Service Bus server errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | -| new_host_delay | Delay in seconds before monitor new resource | string | `300` | no | | status_extra_tags | Extra tags for Service Bus status monitor | list | `` | no | | status_message | Custom message for Service Bus status monitor | string | `` | no | | status_silenced | Groups to mute for Service Bus status monitor | map | `` | no | diff --git a/cloud/azure/servicebus/inputs.tf b/cloud/azure/servicebus/inputs.tf index ca8bbc4..8d6a454 100644 --- a/cloud/azure/servicebus/inputs.tf +++ b/cloud/azure/servicebus/inputs.tf @@ -56,7 +56,7 @@ variable "status_time_aggregator" { variable "status_timeframe" { description = "Monitor timeframe for Service Bus status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" - default = "last_15m" + default = "last_5m" } variable "no_active_connections_silenced" { @@ -80,7 +80,7 @@ variable "no_active_connections_time_aggregator" { variable "no_active_connections_timeframe" { description = "Monitor timeframe for Service Bus status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" type = "string" - default = "last_15m" + default = "last_5m" } variable "server_errors_message" { diff --git a/cloud/azure/servicebus/monitors-service-bus.tf b/cloud/azure/servicebus/monitors-service-bus.tf index 79acaf2..0a17b93 100644 --- a/cloud/azure/servicebus/monitors-service-bus.tf +++ 
b/cloud/azure/servicebus/monitors-service-bus.tf @@ -31,7 +31,7 @@ resource "datadog_monitor" "service_bus_no_active_connections" { query = < ${var.user_errors_threshold_critical} EOF @@ -74,14 +74,14 @@ resource "datadog_monitor" "service_bus_user_errors" { silenced = "${var.user_errors_silenced}" notify_no_data = false - evaluation_delay = "${var.delay}" + evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true locked = false require_full_window = true - new_host_delay = "${var.delay}" + new_host_delay = "${var.new_host_delay}" tags = ["env:${var.environment}", "resource:servicebus", "team:azure", "provider:azure"] } @@ -92,8 +92,8 @@ resource "datadog_monitor" "service_bus_server_errors" { query = < ${var.server_errors_threshold_critical} EOF @@ -108,14 +108,14 @@ resource "datadog_monitor" "service_bus_server_errors" { silenced = "${var.server_errors_silenced}" notify_no_data = false - evaluation_delay = "${var.delay}" + evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true locked = false require_full_window = true - new_host_delay = "${var.delay}" + new_host_delay = "${var.new_host_delay}" tags = ["env:${var.environment}", "resource:servicebus", "team:azure", "provider:azure"] } From e2ce23c0f514a32ca5a07a2a0f0b09ca82a650a5 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Mon, 27 Aug 2018 16:31:41 +0200 Subject: [PATCH 09/18] MON-162 Add `enabled` variable for new Azure monitors --- cloud/azure/cosmosdb/inputs.tf | 30 +++++++ cloud/azure/cosmosdb/monitors-cosmosdb.tf | 10 ++- cloud/azure/datalakestore/inputs.tf | 6 ++ .../datalakestore/monitors-datalakestore.tf | 2 + cloud/azure/inputs.tf | 78 +++++++++++++++++++ cloud/azure/keyvault/inputs.tf | 18 +++++ cloud/azure/keyvault/monitors-keyvault.tf | 6 ++ cloud/azure/monitors.tf | 8 ++ cloud/azure/servicebus/inputs.tf | 24 ++++++ .../azure/servicebus/monitors-service-bus.tf | 
8 ++ 10 files changed, 189 insertions(+), 1 deletion(-) diff --git a/cloud/azure/cosmosdb/inputs.tf b/cloud/azure/cosmosdb/inputs.tf index 19f0939..891f2cc 100644 --- a/cloud/azure/cosmosdb/inputs.tf +++ b/cloud/azure/cosmosdb/inputs.tf @@ -28,6 +28,12 @@ variable "new_host_delay" { } # Azure CosmosDB specific variables +variable "status_enabled" { + description = "Flag to enable Cosmos DB status monitor" + type = "string" + default = "true" +} + variable "status_silenced" { description = "Groups to mute for Cosmos DB status monitor" type = "map" @@ -64,6 +70,12 @@ variable "cosmos_db_4xx_requests_message" { default = "" } +variable "cosmos_db_4xx_requests_enabled" { + description = "Flag to enable Cosmos DB 4xx requests monitor" + type = "string" + default = "true" +} + variable "cosmos_db_4xx_requests_silenced" { description = "Groups to mute for Cosmos DB 4xx requests monitor" type = "map" @@ -104,6 +116,12 @@ variable "cosmos_db_5xx_requests_message" { default = "" } +variable "cosmos_db_5xx_requests_enabled" { + description = "Flag to enable Cosmos DB 5xx requests monitor" + type = "string" + default = "true" +} + variable "cosmos_db_5xx_requests_silenced" { description = "Groups to mute for Cosmos DB 5xx requests monitor" type = "map" @@ -144,6 +162,12 @@ variable "cosmos_db_no_request_message" { default = "" } +variable "cosmos_db_no_request_enabled" { + description = "Flag to enable Cosmos DB no request monitor" + type = "string" + default = "true" +} + variable "cosmos_db_no_request_silenced" { description = "Groups to mute for Cosmos DB no request monitor" type = "map" @@ -174,6 +198,12 @@ variable "cosmos_db_ru_utilization_message" { default = "" } +variable "cosmos_db_ru_utilization_enabled" { + description = "Flag to enable Cosmos DB collection RU utilization monitor" + type = "string" + default = "true" +} + variable "cosmos_db_ru_utilization_silenced" { description = "Groups to mute for Cosmos DB collection RU utilization monitor" type = "map" diff 
--git a/cloud/azure/cosmosdb/monitors-cosmosdb.tf b/cloud/azure/cosmosdb/monitors-cosmosdb.tf index 4b1e23b..25213b9 100644 --- a/cloud/azure/cosmosdb/monitors-cosmosdb.tf +++ b/cloud/azure/cosmosdb/monitors-cosmosdb.tf @@ -1,4 +1,6 @@ resource "datadog_monitor" "cosmos_db_status" { + count = "${var.status_enabled ? 1 : 0}" + name = "[${var.environment}] Cosmos DB is down" message = "${coalesce(var.status_message, var.message)}" @@ -31,6 +33,8 @@ resource "datadog_monitor" "cosmos_db_status" { } resource "datadog_monitor" "cosmos_db_4xx_requests" { + count = "${var.cosmos_db_4xx_requests_enabled ? 1 : 0}" + name = "[${var.environment}] Cosmos DB 4xx requests rate is high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = "${coalesce(var.cosmos_db_4xx_requests_message, var.message)}" @@ -88,6 +92,8 @@ resource "datadog_monitor" "cosmos_db_4xx_requests" { } resource "datadog_monitor" "cosmos_db_5xx_requests" { + count = "${var.cosmos_db_5xx_requests_enabled ? 1 : 0}" + name = "[${var.environment}] Cosmos DB 5xx requests rate is high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = "${coalesce(var.cosmos_db_5xx_requests_message, var.message)}" @@ -129,6 +135,8 @@ resource "datadog_monitor" "cosmos_db_5xx_requests" { } resource "datadog_monitor" "cosmos_db_success_no_data" { + count = "${var.cosmos_db_no_request_enabled ? 1 : 0}" + name = "[${var.environment}] Cosmos DB has no request" message = "${coalesce(var.cosmos_db_no_request_message, var.message)}" @@ -157,7 +165,7 @@ resource "datadog_monitor" "cosmos_db_success_no_data" { } resource "datadog_monitor" "cosmos_db_ru_utilization" { - count = "${length(var.cosmos_db_ru_utilization_collections)}" + count = "${var.cosmos_db_ru_utilization_enabled ? 
length(var.cosmos_db_ru_utilization_collections) : 0}" name = "[${var.environment}] Cosmos DB collection ${element(keys(var.cosmos_db_ru_utilization_collections),count.index)} RU utilization is high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = "${coalesce(var.cosmos_db_ru_utilization_message, var.message)}" diff --git a/cloud/azure/datalakestore/inputs.tf b/cloud/azure/datalakestore/inputs.tf index 319056a..7a0183c 100644 --- a/cloud/azure/datalakestore/inputs.tf +++ b/cloud/azure/datalakestore/inputs.tf @@ -28,6 +28,12 @@ variable "new_host_delay" { } # Azure Datalake Store specific variables +variable "status_enabled" { + description = "Flag to enable Datalake Store status monitor" + type = "string" + default = "true" +} + variable "status_silenced" { description = "Groups to mute for Datalake Store status monitor" type = "map" diff --git a/cloud/azure/datalakestore/monitors-datalakestore.tf b/cloud/azure/datalakestore/monitors-datalakestore.tf index b7c41b1..db5f78f 100644 --- a/cloud/azure/datalakestore/monitors-datalakestore.tf +++ b/cloud/azure/datalakestore/monitors-datalakestore.tf @@ -1,4 +1,6 @@ resource "datadog_monitor" "datalakestore_status" { + count = "${var.status_enabled ? 
1 : 0}" + name = "[${var.environment}] Datalake Store is down" message = "${coalesce(var.status_message, var.message)}" diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf index 969a51f..10a89d1 100644 --- a/cloud/azure/inputs.tf +++ b/cloud/azure/inputs.tf @@ -907,6 +907,12 @@ variable "redis_server_load_rate_threshold_warning" { } # Azure Service Bus specific variables +variable "servicebus_status_enabled" { + description = "Flag to enable Service Bus status monitor" + type = "string" + default = "true" +} + variable "servicebus_status_silenced" { description = "Groups to mute for Service Bus status monitor" type = "map" @@ -930,6 +936,12 @@ variable "servicebus_status_timeframe" { default = "last_5m" } +variable "servicebus_no_active_connections_enabled" { + description = "Flag to enable Service Bus status monitor" + type = "string" + default = "true" +} + variable "servicebus_no_active_connections_silenced" { description = "Groups to mute for Service Bus status monitor" type = "map" @@ -960,6 +972,12 @@ variable "servicebus_server_errors_message" { default = "" } +variable "servicebus_server_errors_enabled" { + description = "Flag to enable Service Bus server errors monitor" + type = "string" + default = "true" +} + variable "servicebus_server_errors_silenced" { description = "Groups to mute for Service Bus server errors monitor" type = "map" @@ -988,6 +1006,12 @@ variable "servicebus_user_errors_message" { default = "" } +variable "servicebus_user_errors_enabled" { + description = "Flag to enable Service Bus user errors monitor" + type = "string" + default = "true" +} + variable "servicebus_user_errors_silenced" { description = "Groups to mute for Service Bus user errors monitor" type = "map" @@ -1599,6 +1623,12 @@ variable "streamanalytics_runtime_errors_threshold_critical" { } # Azure CosmosDB specific variables +variable "cosmos_db_status_enabled" { + description = "Flag to enable Cosmos DB status monitor" + type = "string" + default = "true" +} + 
variable "cosmos_db_status_silenced" { description = "Groups to mute for Cosmos DB status monitor" type = "map" @@ -1635,6 +1665,12 @@ variable "cosmos_db_4xx_requests_message" { default = "" } +variable "cosmos_db_4xx_requests_enabled" { + description = "Flag to enable Cosmos DB 4xx requests monitor" + type = "string" + default = "true" +} + variable "cosmos_db_4xx_requests_silenced" { description = "Groups to mute for Cosmos DB 4xx requests monitor" type = "map" @@ -1675,6 +1711,12 @@ variable "cosmos_db_5xx_requests_message" { default = "" } +variable "cosmos_db_5xx_requests_enabled" { + description = "Flag to enable Cosmos DB 5xx requests monitor" + type = "string" + default = "true" +} + variable "cosmos_db_5xx_requests_silenced" { description = "Groups to mute for Cosmos DB 5xx requests monitor" type = "map" @@ -1715,6 +1757,12 @@ variable "cosmos_db_no_request_message" { default = "" } +variable "cosmos_db_no_request_enabled" { + description = "Flag to enable Cosmos DB no request monitor" + type = "string" + default = "true" +} + variable "cosmos_db_no_request_silenced" { description = "Groups to mute for Cosmos DB no request monitor" type = "map" @@ -1745,6 +1793,12 @@ variable "cosmos_db_ru_utilization_message" { default = "" } +variable "cosmos_db_ru_utilization_enabled" { + description = "Flag to enable Cosmos DB collection RU utilization monitor" + type = "string" + default = "true" +} + variable "cosmos_db_ru_utilization_silenced" { description = "Groups to mute for Cosmos DB collection RU utilization monitor" type = "map" @@ -1785,6 +1839,12 @@ variable "cosmos_db_ru_utilization_collections" { } # Azure Datalake Store specific variables +variable "datalakestore_status_enabled" { + description = "Flag to enable Datalake Store status monitor" + type = "string" + default = "true" +} + variable "datalakestore_status_silenced" { description = "Groups to mute for Datalake Store status monitor" type = "map" @@ -1814,6 +1874,12 @@ variable 
"datalakestore_status_extra_tags" { default = [] } +variable "keyvault_status_enabled" { + description = "Flag to enable Key Vault status monitor" + type = "string" + default = "true" +} + variable "keyvault_status_silenced" { description = "Groups to mute for Key Vault status monitor" type = "map" @@ -1843,6 +1909,12 @@ variable "keyvault_status_extra_tags" { default = [] } +variable "keyvault_api_result_enabled" { + description = "Flag to enable Key Vault API result monitor" + type = "string" + default = "true" +} + variable "keyvault_api_result_silenced" { description = "Groups to mute for Key Vault API result monitor" type = "map" @@ -1882,6 +1954,12 @@ variable "keyvault_api_result_extra_tags" { default = [] } +variable "keyvault_api_latency_enabled" { + description = "Flag to enable Key Vault API latency monitor" + type = "string" + default = "true" +} + variable "keyvault_api_latency_silenced" { description = "Groups to mute for Key Vault API latency monitor" type = "map" diff --git a/cloud/azure/keyvault/inputs.tf b/cloud/azure/keyvault/inputs.tf index 1e31cec..fa03318 100644 --- a/cloud/azure/keyvault/inputs.tf +++ b/cloud/azure/keyvault/inputs.tf @@ -28,6 +28,12 @@ variable "new_host_delay" { } # Azure Key Vault specific variables +variable "status_enabled" { + description = "Flag to enable Key Vault status monitor" + type = "string" + default = "true" +} + variable "status_silenced" { description = "Groups to mute for Key Vault status monitor" type = "map" @@ -57,6 +63,12 @@ variable "status_extra_tags" { default = [] } +variable "api_result_enabled" { + description = "Flag to enable Key Vault API result monitor" + type = "string" + default = "true" +} + variable "api_result_silenced" { description = "Groups to mute for Key Vault API result monitor" type = "map" @@ -96,6 +108,12 @@ variable "api_result_extra_tags" { default = [] } +variable "api_latency_enabled" { + description = "Flag to enable Key Vault API latency monitor" + type = "string" + default 
= "true" +} + variable "api_latency_silenced" { description = "Groups to mute for Key Vault API latency monitor" type = "map" diff --git a/cloud/azure/keyvault/monitors-keyvault.tf b/cloud/azure/keyvault/monitors-keyvault.tf index 9f5703d..4cf7d33 100644 --- a/cloud/azure/keyvault/monitors-keyvault.tf +++ b/cloud/azure/keyvault/monitors-keyvault.tf @@ -1,4 +1,6 @@ resource "datadog_monitor" "keyvault_status" { + count = "${var.status_enabled ? 1 : 0}" + name = "[${var.environment}] Key Vault is down" message = "${coalesce(var.status_message, var.message)}" @@ -26,6 +28,8 @@ resource "datadog_monitor" "keyvault_status" { } resource "datadog_monitor" "keyvault_api_result" { + count = "${var.api_result_enabled ? 1 : 0}" + name = "[${var.environment}] Key Vault API result rate is low {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = "${coalesce(var.status_message, var.message)}" @@ -59,6 +63,8 @@ resource "datadog_monitor" "keyvault_api_result" { } resource "datadog_monitor" "keyvault_api_latency" { + count = "${var.api_latency_enabled ? 
1 : 0}" + name = "[${var.environment}] Key Vault API latency is high {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" message = "${coalesce(var.status_message, var.message)}" diff --git a/cloud/azure/monitors.tf b/cloud/azure/monitors.tf index a495dc8..b5559de 100644 --- a/cloud/azure/monitors.tf +++ b/cloud/azure/monitors.tf @@ -398,12 +398,14 @@ module "cosmosdb" { filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_custom = "${var.filter_tags_custom}" + status_enabled = "${var.cosmos_db_status_enabled}" status_message = "${var.cosmos_db_status_message}" status_silenced = "${var.cosmos_db_status_silenced}" status_extra_tags = "${var.cosmos_db_status_extra_tags}" status_time_aggregator = "${var.cosmos_db_status_time_aggregator}" status_timeframe = "${var.cosmos_db_status_timeframe}" + cosmos_db_4xx_requests_enabled = "${var.cosmos_db_4xx_requests_enabled}" cosmos_db_4xx_request_rate_threshold_critical = "${var.cosmos_db_4xx_request_rate_threshold_critical}" cosmos_db_4xx_request_rate_threshold_warning = "${var.cosmos_db_4xx_request_rate_threshold_warning}" cosmos_db_4xx_requests_message = "${var.cosmos_db_4xx_requests_message}" @@ -412,6 +414,7 @@ module "cosmosdb" { cosmos_db_4xx_request_time_aggregator = "${var.cosmos_db_4xx_request_time_aggregator}" cosmos_db_4xx_request_timeframe = "${var.cosmos_db_4xx_request_timeframe}" + cosmos_db_5xx_requests_enabled = "${var.cosmos_db_5xx_requests_enabled}" cosmos_db_5xx_request_rate_threshold_critical = "${var.cosmos_db_5xx_request_rate_threshold_critical}" cosmos_db_5xx_request_rate_threshold_warning = "${var.cosmos_db_5xx_request_rate_threshold_warning}" cosmos_db_5xx_requests_message = "${var.cosmos_db_5xx_requests_message}" @@ -420,12 +423,14 @@ module "cosmosdb" { cosmos_db_5xx_request_time_aggregator = "${var.cosmos_db_5xx_request_time_aggregator}" cosmos_db_5xx_request_timeframe = 
"${var.cosmos_db_5xx_request_timeframe}" + cosmos_db_no_request_enabled = "${var.cosmos_db_no_request_enabled}" cosmos_db_no_request_message = "${var.cosmos_db_no_request_message}" cosmos_db_no_request_silenced = "${var.cosmos_db_no_request_silenced}" cosmos_db_no_request_extra_tags = "${var.cosmos_db_no_request_extra_tags}" cosmos_db_no_request_time_aggregator = "${var.cosmos_db_no_request_time_aggregator}" cosmos_db_no_request_timeframe = "${var.cosmos_db_no_request_timeframe}" + cosmos_db_ru_utilization_enabled = "${var.cosmos_db_ru_utilization_enabled}" cosmos_db_ru_utilization_rate_threshold_critical = "${var.cosmos_db_ru_utilization_rate_threshold_critical}" cosmos_db_ru_utilization_rate_threshold_warning = "${var.cosmos_db_ru_utilization_rate_threshold_warning}" cosmos_db_ru_utilization_message = "${var.cosmos_db_ru_utilization_message}" @@ -447,6 +452,7 @@ module "datalakestore" { filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_custom = "${var.filter_tags_custom}" + status_enabled = "${var.datalakestore_status_enabled}" status_silenced = "${var.datalakestore_status_silenced}" status_message = "${var.datalakestore_status_message}" status_timeframe = "${var.datalakestore_status_timeframe}" @@ -465,6 +471,7 @@ module "keyvault" { filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_custom = "${var.filter_tags_custom}" + status_enabled = "${var.keyvault_status_enabled}" status_silenced = "${var.keyvault_status_silenced}" status_message = "${var.keyvault_status_message}" status_timeframe = "${var.keyvault_status_timeframe}" @@ -480,6 +487,7 @@ module "keyvault" { api_result_threshold_warning = "${var.keyvault_api_result_threshold_warning}" api_result_extra_tags = "${var.keyvault_api_result_extra_tags}" + api_latency_enabled = "${var.keyvault_api_latency_enabled}" api_latency_silenced = "${var.keyvault_api_latency_silenced}" api_latency_message = "${var.keyvault_api_latency_message}" api_latency_timeframe = 
"${var.keyvault_api_latency_timeframe}" diff --git a/cloud/azure/servicebus/inputs.tf b/cloud/azure/servicebus/inputs.tf index 8d6a454..eccabf5 100644 --- a/cloud/azure/servicebus/inputs.tf +++ b/cloud/azure/servicebus/inputs.tf @@ -30,6 +30,12 @@ variable "filter_tags_custom" { } # Azure Service Bus specific variables +variable "status_enabled" { + description = "Flag to enable Service Bus status monitor" + type = "string" + default = "true" +} + variable "status_silenced" { description = "Groups to mute for Service Bus status monitor" type = "map" @@ -59,6 +65,12 @@ variable "status_timeframe" { default = "last_5m" } +variable "no_active_connections_enabled" { + description = "Flag to enable Service Bus status monitor" + type = "string" + default = "true" +} + variable "no_active_connections_silenced" { description = "Groups to mute for Service Bus status monitor" type = "map" @@ -89,6 +101,12 @@ variable "server_errors_message" { default = "" } +variable "server_errors_enabled" { + description = "Flag to enable Service Bus server errors monitor" + type = "string" + default = "true" +} + variable "server_errors_silenced" { description = "Groups to mute for Service Bus server errors monitor" type = "map" @@ -117,6 +135,12 @@ variable "user_errors_message" { default = "" } +variable "user_errors_enabled" { + description = "Flag to enable Service Bus user errors monitor" + type = "string" + default = "true" +} + variable "user_errors_silenced" { description = "Groups to mute for Service Bus user errors monitor" type = "map" diff --git a/cloud/azure/servicebus/monitors-service-bus.tf b/cloud/azure/servicebus/monitors-service-bus.tf index 0a17b93..2e953d0 100644 --- a/cloud/azure/servicebus/monitors-service-bus.tf +++ b/cloud/azure/servicebus/monitors-service-bus.tf @@ -1,4 +1,6 @@ resource "datadog_monitor" "servicebus_status" { + count = "${var.status_enabled ? 
1 : 0}" + name = "[${var.environment}] Service Bus is down" message = "${coalesce(var.status_message, var.message)}" @@ -26,6 +28,8 @@ EOF } resource "datadog_monitor" "service_bus_no_active_connections" { + count = "${var.no_active_connections_enabled ? 1 : 0}" + name = "[${var.environment}] Service Bus has no active connection" message = "${coalesce(var.no_active_connections_message, var.message)}" @@ -53,6 +57,8 @@ resource "datadog_monitor" "service_bus_no_active_connections" { } resource "datadog_monitor" "service_bus_user_errors" { + count = "${var.user_errors_enabled ? 1 : 0}" + name = "[${var.environment}] Service Bus user errors rate is high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = "${coalesce(var.user_errors_message, var.message)}" @@ -87,6 +93,8 @@ resource "datadog_monitor" "service_bus_user_errors" { } resource "datadog_monitor" "service_bus_server_errors" { + count = "${var.server_errors_enabled ? 
1 : 0}" + name = "[${var.environment}] Service Bus server errors rate is high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = "${coalesce(var.server_errors_message, var.message)}" From 49a8a22958964b990dfa5b401fd2b8ba6d230c9c Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Mon, 27 Aug 2018 17:59:01 +0200 Subject: [PATCH 10/18] MON-237 Fix Cosmos DB RU monitor & auto-update --- cloud/azure/cosmosdb/README.md | 25 ++++++++++++++--------- cloud/azure/cosmosdb/inputs.tf | 4 ++-- cloud/azure/cosmosdb/monitors-cosmosdb.tf | 4 ++-- cloud/azure/cosmosdb/outputs-custom.tf | 4 ---- cloud/azure/cosmosdb/outputs.tf | 5 +++++ cloud/azure/datalakestore/README.md | 3 ++- cloud/azure/inputs.tf | 4 ++-- cloud/azure/keyvault/README.md | 16 +++++++++++++-- cloud/azure/keyvault/outputs.tf | 5 +++++ cloud/azure/servicebus/README.md | 8 ++++++-- 10 files changed, 53 insertions(+), 25 deletions(-) delete mode 100644 cloud/azure/cosmosdb/outputs-custom.tf diff --git a/cloud/azure/cosmosdb/README.md b/cloud/azure/cosmosdb/README.md index 0b908dc..bfb9bc0 100644 --- a/cloud/azure/cosmosdb/README.md +++ b/cloud/azure/cosmosdb/README.md @@ -29,41 +29,46 @@ Creates DataDog monitors with the following checks: | cosmos_db_4xx_request_extra_tags | Extra tags for Cosmos DB 4xx requests monitor | list | `` | no | | cosmos_db_4xx_request_rate_threshold_critical | Critical threshold for Cosmos DB 4xx requests monitor | string | `80` | no | | cosmos_db_4xx_request_rate_threshold_warning | Warning threshold for Cosmos DB 4xx requests monitor | string | `50` | no | -| cosmos_db_4xx_request_time_aggregator | Monitor aggregator for Cosmos DB status [available values: min, max or avg] | string | `sum` | no | -| cosmos_db_4xx_request_timeframe | Monitor timeframe for Cosmos DB status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` 
| no | +| cosmos_db_4xx_request_time_aggregator | Monitor aggregator for Cosmos DB 4xx requests [available values: min, max or avg] | string | `sum` | no | +| cosmos_db_4xx_request_timeframe | Monitor timeframe for Cosmos DB 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | +| cosmos_db_4xx_requests_enabled | Flag to enable Cosmos DB 4xx requests monitor | string | `true` | no | | cosmos_db_4xx_requests_message | Custom message for Cosmos DB 4xx requests monitor | string | `` | no | | cosmos_db_4xx_requests_silenced | Groups to mute for Cosmos DB 4xx requests monitor | map | `` | no | | cosmos_db_5xx_request_rate_extra_tags | Extra tags for Cosmos DB 5xx requests monitor | list | `` | no | | cosmos_db_5xx_request_rate_threshold_critical | Critical threshold for Cosmos DB 5xx requests monitor | string | `80` | no | | cosmos_db_5xx_request_rate_threshold_warning | Warning threshold for Cosmos DB 5xx requests monitor | string | `50` | no | -| cosmos_db_5xx_request_time_aggregator | Monitor aggregator for Cosmos DB status [available values: min, max or avg] | string | `sum` | no | -| cosmos_db_5xx_request_timeframe | Monitor timeframe for Cosmos DB status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| cosmos_db_5xx_request_time_aggregator | Monitor aggregator for Cosmos DB 5xx requests [available values: min, max or avg] | string | `sum` | no | +| cosmos_db_5xx_request_timeframe | Monitor timeframe for Cosmos DB 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | +| cosmos_db_5xx_requests_enabled | Flag to enable Cosmos DB 5xx requests monitor | string | `true` | no | | cosmos_db_5xx_requests_message | Custom message for Cosmos DB 5xx requests monitor | string | `` | no | | cosmos_db_5xx_requests_silenced | Groups to mute for Cosmos DB 
5xx requests monitor | map | `` | no | +| cosmos_db_no_request_enabled | Flag to enable Cosmos DB no request monitor | string | `true` | no | | cosmos_db_no_request_extra_tags | Extra tags for Cosmos DB no request monitor | list | `` | no | | cosmos_db_no_request_message | Custom message for Cosmos DB no request monitor | string | `` | no | | cosmos_db_no_request_silenced | Groups to mute for Cosmos DB no request monitor | map | `` | no | -| cosmos_db_no_request_time_aggregator | Monitor aggregator for Cosmos DB status [available values: min, max or avg] | string | `max` | no | -| cosmos_db_no_request_timeframe | Monitor timeframe for Cosmos DB status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | -| cosmos_db_ru_utilization_collections | Group to associate Cosmos DB collection to RU max | map | - | yes | +| cosmos_db_no_request_time_aggregator | Monitor aggregator for Cosmos DB no request [available values: min, max or avg] | string | `max` | no | +| cosmos_db_no_request_timeframe | Monitor timeframe for Cosmos DB no request [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | +| cosmos_db_ru_utilization_collections | Group to associate Cosmos DB collection to RU max. 
RU value has to be correlated with the monitor timeframe | map | - | yes | +| cosmos_db_ru_utilization_enabled | Flag to enable Cosmos DB collection RU utilization monitor | string | `true` | no | | cosmos_db_ru_utilization_extra_tags | Extra tags for Cosmos DB collection RU utilization monitor | list | `` | no | | cosmos_db_ru_utilization_message | Custom message for Cosmos DB collection RU utilization monitor | string | `` | no | | cosmos_db_ru_utilization_rate_threshold_critical | Critical threshold for Cosmos DB collection RU utilization monitor | string | `90` | no | | cosmos_db_ru_utilization_rate_threshold_warning | Warning threshold for Cosmos DB collection RU utilization monitor | string | `80` | no | | cosmos_db_ru_utilization_silenced | Groups to mute for Cosmos DB collection RU utilization monitor | map | `` | no | -| cosmos_db_ru_utilization_time_aggregator | Monitor aggregator for Cosmos DB status [available values: min, max or avg] | string | `max` | no | -| cosmos_db_ru_utilization_timeframe | Monitor timeframe for Cosmos DB status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| cosmos_db_ru_utilization_time_aggregator | Monitor aggregator for Cosmos DB RU utilization [available values: min, max or avg] | string | `sum` | no | +| cosmos_db_ru_utilization_timeframe | Monitor timeframe for Cosmos DB RU utilization [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | environment | Architecture environment | string | - | yes | | evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when a monitor is triggered | string | - | yes | | new_host_delay | Delay in 
seconds before monitor new resource | string | `300` | no | +| status_enabled | Flag to enable Cosmos DB status monitor | string | `true` | no | | status_extra_tags | Extra tags for Cosmos DB status monitor | list | `` | no | | status_message | Custom message for Cosmos DB status monitor | string | `` | no | | status_silenced | Groups to mute for Cosmos DB status monitor | map | `` | no | | status_time_aggregator | Monitor aggregator for Cosmos DB status [available values: min, max or avg] | string | `max` | no | -| status_timeframe | Monitor timeframe for Cosmos DB status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| status_timeframe | Monitor timeframe for Cosmos DB status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | ## Outputs diff --git a/cloud/azure/cosmosdb/inputs.tf b/cloud/azure/cosmosdb/inputs.tf index 891f2cc..84c610c 100644 --- a/cloud/azure/cosmosdb/inputs.tf +++ b/cloud/azure/cosmosdb/inputs.tf @@ -229,7 +229,7 @@ variable "cosmos_db_ru_utilization_extra_tags" { variable "cosmos_db_ru_utilization_time_aggregator" { description = "Monitor aggregator for Cosmos DB RU utilization [available values: min, max or avg]" type = "string" - default = "min" + default = "sum" } variable "cosmos_db_ru_utilization_timeframe" { @@ -239,6 +239,6 @@ variable "cosmos_db_ru_utilization_timeframe" { } variable "cosmos_db_ru_utilization_collections" { - description = "Group to associate Cosmos DB collection to RU max" + description = "Group to associate Cosmos DB collection to RU max. 
RU value has to be correlated with the monitor timeframe" type = "map" } diff --git a/cloud/azure/cosmosdb/monitors-cosmosdb.tf b/cloud/azure/cosmosdb/monitors-cosmosdb.tf index 25213b9..625764f 100644 --- a/cloud/azure/cosmosdb/monitors-cosmosdb.tf +++ b/cloud/azure/cosmosdb/monitors-cosmosdb.tf @@ -173,8 +173,8 @@ resource "datadog_monitor" "cosmos_db_ru_utilization" { query = < ${var.cosmos_db_ru_utilization_rate_threshold_critical} diff --git a/cloud/azure/cosmosdb/outputs-custom.tf b/cloud/azure/cosmosdb/outputs-custom.tf deleted file mode 100644 index 1da4343..0000000 --- a/cloud/azure/cosmosdb/outputs-custom.tf +++ /dev/null @@ -1,4 +0,0 @@ -output "cosmos_db_ru_utilization_id" { - description = "id for monitor cosmos_db_ru_utilization" - value = "${datadog_monitor.cosmos_db_ru_utilization.*.id}" -} diff --git a/cloud/azure/cosmosdb/outputs.tf b/cloud/azure/cosmosdb/outputs.tf index 1039a55..25c3159 100644 --- a/cloud/azure/cosmosdb/outputs.tf +++ b/cloud/azure/cosmosdb/outputs.tf @@ -17,3 +17,8 @@ output "cosmos_db_success_no_data_id" { description = "id for monitor cosmos_db_success_no_data" value = "${datadog_monitor.cosmos_db_success_no_data.id}" } + +output "cosmos_db_ru_utilization_id" { + description = "id for monitor cosmos_db_ru_utilization" + value = "${datadog_monitor.cosmos_db_ru_utilization.*.id}" +} diff --git a/cloud/azure/datalakestore/README.md b/cloud/azure/datalakestore/README.md index 9603957..9ac4a1f 100644 --- a/cloud/azure/datalakestore/README.md +++ b/cloud/azure/datalakestore/README.md @@ -28,11 +28,12 @@ Creates DataDog monitors with the following checks: | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when a monitor is triggered | string | - | yes | | new_host_delay | Delay in seconds before monitor new resource | string | `300` | no | +| status_enabled | Flag to enable Datalake Store status monitor | string | `true` | no | | status_extra_tags | Extra tags for 
Datalake Store status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | list | `` | no | | status_message | Custom message for Datalake Store status monitor | string | `` | no | | status_silenced | Groups to mute for Datalake Store status monitor | map | `` | no | | status_time_aggregator | Monitor aggregator for Datalake Store status [available values: min, max or avg] | string | `max` | no | -| status_timeframe | Monitor timeframe for Datalake Store status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| status_timeframe | Monitor timeframe for Datalake Store status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | ## Outputs diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf index 10a89d1..7d80b2a 100644 --- a/cloud/azure/inputs.tf +++ b/cloud/azure/inputs.tf @@ -1824,7 +1824,7 @@ variable "cosmos_db_ru_utilization_extra_tags" { variable "cosmos_db_ru_utilization_time_aggregator" { description = "Monitor aggregator for Cosmos DB RU utilization [available values: min, max or avg]" type = "string" - default = "avg" + default = "sum" } variable "cosmos_db_ru_utilization_timeframe" { @@ -1834,7 +1834,7 @@ variable "cosmos_db_ru_utilization_timeframe" { } variable "cosmos_db_ru_utilization_collections" { - description = "Group to associate Cosmos DB collection to RU max" + description = "Group to associate Cosmos DB collection to RU max. 
RU value has to be correlated with the monitor timeframe" type = "map" } diff --git a/cloud/azure/keyvault/README.md b/cloud/azure/keyvault/README.md index 9072c21..b3867de 100644 --- a/cloud/azure/keyvault/README.md +++ b/cloud/azure/keyvault/README.md @@ -16,6 +16,7 @@ module "datadog-monitors-cloud-azure-keyvault" { Creates DataDog monitors with the following checks: +- Key Vault API latency is high - Key Vault API result rate is low - Key Vault is down @@ -23,29 +24,40 @@ Creates DataDog monitors with the following checks: | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| +| api_latency_enabled | Flag to enable Key Vault API latency monitor | string | `true` | no | +| api_latency_extra_tags | Extra tags for Key Vault API latency monitor | list | `` | no | +| api_latency_message | Custom message for Key Vault API latency monitor | string | `` | no | +| api_latency_silenced | Groups to mute for Key Vault API latency monitor | map | `` | no | +| api_latency_threshold_critical | Critical threshold for Key Vault API latency rate | string | `100` | no | +| api_latency_threshold_warning | Warning threshold for Key Vault API latency rate | string | `80` | no | +| api_latency_time_aggregator | Monitor aggregator for Key Vault API latency [available values: min, max or avg] | string | `min` | no | +| api_latency_timeframe | Monitor timeframe for Key Vault API latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | +| api_result_enabled | Flag to enable Key Vault API result monitor | string | `true` | no | | api_result_extra_tags | Extra tags for Key Vault API result monitor | list | `` | no | | api_result_message | Custom message for Key Vault API result monitor | string | `` | no | | api_result_silenced | Groups to mute for Key Vault API result monitor | map | `` | no | | api_result_threshold_critical | Critical threshold for Key Vault API result rate | 
string | `10` | no | | api_result_threshold_warning | Warning threshold for Key Vault API result rate | string | `30` | no | | api_result_time_aggregator | Monitor aggregator for Key Vault API result [available values: min, max or avg] | string | `sum` | no | -| api_result_timeframe | Monitor timeframe for Key Vault API result [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_30m` | no | +| api_result_timeframe | Monitor timeframe for Key Vault API result [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | environment | Architecture environment | string | - | yes | | evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when a monitor is triggered | string | - | yes | | new_host_delay | Delay in seconds before monitor new resource | string | `300` | no | +| status_enabled | Flag to enable Key Vault status monitor | string | `true` | no | | status_extra_tags | Extra tags for Key Vault status monitor | list | `` | no | | status_message | Custom message for Key Vault status monitor | string | `` | no | | status_silenced | Groups to mute for Key Vault status monitor | map | `` | no | | status_time_aggregator | Monitor aggregator for Key Vault status [available values: min, max or avg] | string | `max` | no | -| status_timeframe | Monitor timeframe for Key Vault status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| status_timeframe | Monitor timeframe for Key Vault status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | ## Outputs | Name | 
Description | |------|-------------| +| keyvault_api_latency_id | id for monitor keyvault_api_latency | | keyvault_api_result_id | id for monitor keyvault_api_result | | keyvault_status_id | id for monitor keyvault_status | diff --git a/cloud/azure/keyvault/outputs.tf b/cloud/azure/keyvault/outputs.tf index 0100295..a5a6178 100644 --- a/cloud/azure/keyvault/outputs.tf +++ b/cloud/azure/keyvault/outputs.tf @@ -7,3 +7,8 @@ output "keyvault_api_result_id" { description = "id for monitor keyvault_api_result" value = "${datadog_monitor.keyvault_api_result.id}" } + +output "keyvault_api_latency_id" { + description = "id for monitor keyvault_api_latency" + value = "${datadog_monitor.keyvault_api_latency.id}" +} diff --git a/cloud/azure/servicebus/README.md b/cloud/azure/servicebus/README.md index cd8682b..c810c21 100644 --- a/cloud/azure/servicebus/README.md +++ b/cloud/azure/servicebus/README.md @@ -31,20 +31,24 @@ Creates DataDog monitors with the following checks: | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when an alert is triggered | string | - | yes | | new_host_delay | Delay in seconds before monitor new resource | string | `300` | no | +| no_active_connections_enabled | Flag to enable Service Bus status monitor | string | `true` | no | | no_active_connections_message | Custom message for Service Bus status monitor | string | `` | no | | no_active_connections_silenced | Groups to mute for Service Bus status monitor | map | `` | no | | no_active_connections_time_aggregator | Monitor aggregator for Service Bus status [available values: min, max or avg] | string | `max` | no | -| no_active_connections_timeframe | Monitor timeframe for Service Bus status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| no_active_connections_timeframe | Monitor timeframe for Service Bus status [available values: `last_#m` (1, 5, 10, 15, or 30), 
`last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | +| server_errors_enabled | Flag to enable Service Bus server errors monitor | string | `true` | no | | server_errors_message | Custom message for Service Bus server errors monitor | string | `` | no | | server_errors_silenced | Groups to mute for Service Bus server errors monitor | map | `` | no | | server_errors_threshold_critical | Critical threshold for Service Bus server errors monitor | string | `90` | no | | server_errors_threshold_warning | Warning threshold for Service Bus server errors monitor | string | `50` | no | | server_errors_timeframe | Monitor timeframe for Service Bus server errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | +| status_enabled | Flag to enable Service Bus status monitor | string | `true` | no | | status_extra_tags | Extra tags for Service Bus status monitor | list | `` | no | | status_message | Custom message for Service Bus status monitor | string | `` | no | | status_silenced | Groups to mute for Service Bus status monitor | map | `` | no | | status_time_aggregator | Monitor aggregator for Service Bus status [available values: min, max or avg] | string | `max` | no | -| status_timeframe | Monitor timeframe for Service Bus status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | +| status_timeframe | Monitor timeframe for Service Bus status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | +| user_errors_enabled | Flag to enable Service Bus user errors monitor | string | `true` | no | | user_errors_message | Custom message for Service Bus user errors monitor | string | `` | no | | user_errors_silenced | Groups to mute for Service Bus user errors monitor | map | `` | no | | user_errors_threshold_critical | Critical threshold for Service Bus user errors monitor 
| string | `90` | no | From 5a6576676b85db904b590946c4e96d4104a27bde Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Tue, 28 Aug 2018 09:23:24 +0200 Subject: [PATCH 11/18] MON-237 Fix Vault latency monitor --- cloud/azure/keyvault/monitors-keyvault.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/keyvault/monitors-keyvault.tf b/cloud/azure/keyvault/monitors-keyvault.tf index 4cf7d33..1847517 100644 --- a/cloud/azure/keyvault/monitors-keyvault.tf +++ b/cloud/azure/keyvault/monitors-keyvault.tf @@ -83,7 +83,7 @@ resource "datadog_monitor" "keyvault_api_latency" { silenced = "${var.api_latency_silenced}" - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false From 2a87b780eec902947407e2215a0a6648e02575dc Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Tue, 28 Aug 2018 09:33:12 +0200 Subject: [PATCH 12/18] MON-237 Improve Cosmos DB Readme --- cloud/azure/cosmosdb/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud/azure/cosmosdb/README.md b/cloud/azure/cosmosdb/README.md index bfb9bc0..a8ff7e9 100644 --- a/cloud/azure/cosmosdb/README.md +++ b/cloud/azure/cosmosdb/README.md @@ -8,6 +8,12 @@ module "datadog-monitors-cloud-azure-cosmosdb" { environment = "${var.environment}" message = "${module.datadog-message-alerting.alerting-message}" + + # MyDocumentCollection is provisioned with 1000 RU/s in Azure so, + # we set the RU value for 5 minutes as input because 5m is the default evalutation timeframe + cosmos_db_ru_utilization_collections = { + "MyDocumentCollection" = 300000 # 1000 * 60 * 5 + } } ``` From c7fa59c5bec0e2d26f3ce377ad06a65c38ea1a03 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Tue, 28 Aug 2018 09:43:44 +0200 Subject: [PATCH 13/18] MON-237 Fix generated outputs --- cloud/azure/cosmosdb/outputs.tf | 8 ++++---- cloud/azure/datalakestore/outputs.tf | 2 +- cloud/azure/keyvault/outputs.tf | 6 +++--- 
cloud/azure/servicebus/outputs.tf | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cloud/azure/cosmosdb/outputs.tf b/cloud/azure/cosmosdb/outputs.tf index 25c3159..783ae8b 100644 --- a/cloud/azure/cosmosdb/outputs.tf +++ b/cloud/azure/cosmosdb/outputs.tf @@ -1,21 +1,21 @@ output "cosmos_db_status_id" { description = "id for monitor cosmos_db_status" - value = "${datadog_monitor.cosmos_db_status.id}" + value = "${datadog_monitor.cosmos_db_status.*.id}" } output "cosmos_db_4xx_requests_id" { description = "id for monitor cosmos_db_4xx_requests" - value = "${datadog_monitor.cosmos_db_4xx_requests.id}" + value = "${datadog_monitor.cosmos_db_4xx_requests.*.id}" } output "cosmos_db_5xx_requests_id" { description = "id for monitor cosmos_db_5xx_requests" - value = "${datadog_monitor.cosmos_db_5xx_requests.id}" + value = "${datadog_monitor.cosmos_db_5xx_requests.*.id}" } output "cosmos_db_success_no_data_id" { description = "id for monitor cosmos_db_success_no_data" - value = "${datadog_monitor.cosmos_db_success_no_data.id}" + value = "${datadog_monitor.cosmos_db_success_no_data.*.id}" } output "cosmos_db_ru_utilization_id" { diff --git a/cloud/azure/datalakestore/outputs.tf b/cloud/azure/datalakestore/outputs.tf index 1c2fac8..5a11cbf 100644 --- a/cloud/azure/datalakestore/outputs.tf +++ b/cloud/azure/datalakestore/outputs.tf @@ -1,4 +1,4 @@ output "datalakestore_status_id" { description = "id for monitor datalakestore_status" - value = "${datadog_monitor.datalakestore_status.id}" + value = "${datadog_monitor.datalakestore_status.*.id}" } diff --git a/cloud/azure/keyvault/outputs.tf b/cloud/azure/keyvault/outputs.tf index a5a6178..a6a4ca0 100644 --- a/cloud/azure/keyvault/outputs.tf +++ b/cloud/azure/keyvault/outputs.tf @@ -1,14 +1,14 @@ output "keyvault_status_id" { description = "id for monitor keyvault_status" - value = "${datadog_monitor.keyvault_status.id}" + value = "${datadog_monitor.keyvault_status.*.id}" } output "keyvault_api_result_id" 
{ description = "id for monitor keyvault_api_result" - value = "${datadog_monitor.keyvault_api_result.id}" + value = "${datadog_monitor.keyvault_api_result.*.id}" } output "keyvault_api_latency_id" { description = "id for monitor keyvault_api_latency" - value = "${datadog_monitor.keyvault_api_latency.id}" + value = "${datadog_monitor.keyvault_api_latency.*.id}" } diff --git a/cloud/azure/servicebus/outputs.tf b/cloud/azure/servicebus/outputs.tf index cef391b..ad117a1 100644 --- a/cloud/azure/servicebus/outputs.tf +++ b/cloud/azure/servicebus/outputs.tf @@ -5,15 +5,15 @@ output "servicebus_status_id" { output "service_bus_no_active_connections_id" { description = "id for monitor service_bus_no_active_connections" - value = "${datadog_monitor.service_bus_no_active_connections.id}" + value = "${datadog_monitor.service_bus_no_active_connections.*.id}" } output "service_bus_user_errors_id" { description = "id for monitor service_bus_user_errors" - value = "${datadog_monitor.service_bus_user_errors.id}" + value = "${datadog_monitor.service_bus_user_errors.*.id}" } output "service_bus_server_errors_id" { description = "id for monitor service_bus_server_errors" - value = "${datadog_monitor.service_bus_server_errors.id}" + value = "${datadog_monitor.service_bus_server_errors.*.id}" } From dadc2a76048f49e6566d29a1f7a2a46ed42a2bfa Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Tue, 28 Aug 2018 17:52:38 +0200 Subject: [PATCH 14/18] MON-237 Improve cosmos DB monitors grouping --- cloud/azure/cosmosdb/monitors-cosmosdb.tf | 60 +++++++++++------------ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/cloud/azure/cosmosdb/monitors-cosmosdb.tf b/cloud/azure/cosmosdb/monitors-cosmosdb.tf index 625764f..3442dcd 100644 --- a/cloud/azure/cosmosdb/monitors-cosmosdb.tf +++ b/cloud/azure/cosmosdb/monitors-cosmosdb.tf @@ -41,29 +41,29 @@ resource "datadog_monitor" "cosmos_db_4xx_requests" { query = < ${var.cosmos_db_4xx_request_rate_threshold_critical} @@ -100,13 
+100,13 @@ resource "datadog_monitor" "cosmos_db_5xx_requests" { query = < ${var.cosmos_db_5xx_request_rate_threshold_critical} @@ -142,8 +142,8 @@ resource "datadog_monitor" "cosmos_db_success_no_data" { query = < Date: Wed, 29 Aug 2018 09:54:02 +0200 Subject: [PATCH 15/18] MON-237 Remove Cosmos DB no request monitor --- cloud/azure/README.md | 2 -- cloud/azure/cosmosdb/README.md | 8 ----- cloud/azure/cosmosdb/inputs.tf | 36 ----------------------- cloud/azure/cosmosdb/monitors-cosmosdb.tf | 30 ------------------- cloud/azure/cosmosdb/outputs.tf | 5 ---- cloud/azure/inputs.tf | 36 ----------------------- cloud/azure/monitors.tf | 7 ----- 7 files changed, 124 deletions(-) diff --git a/cloud/azure/README.md b/cloud/azure/README.md index 65aa7a4..daa36be 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -91,8 +91,6 @@ Inputs | cosmos_db_5xx_request_rate_threshold_warning | Warning threshold for Cosmos DB 5xx requests monitor | string | `50` | no | | cosmos_db_5xx_requests_message | Custom message for Cosmos DB 5xx requests monitor | string | `` | no | | cosmos_db_5xx_requests_silenced | Groups to mute for Cosmos DB 5xx requests monitor | map | `` | no | -| cosmos_db_no_request_message | Custom message for Cosmos DB no request monitor | string | `` | no | -| cosmos_db_no_request_silenced | Groups to mute for Cosmos DB no request monitor | map | `` | no | | cosmos_db_ru_utilization_collection | Group to associate Cosmos DB collection to RU max | map | - | yes | | cosmos_db_ru_utilization_message | Custom message for Cosmos DB collection RU utilization monitor | string | `` | no | | cosmos_db_ru_utilization_rate_threshold_critical | Critical threshold for Cosmos DB collection RU utilization monitor | string | `90` | no | diff --git a/cloud/azure/cosmosdb/README.md b/cloud/azure/cosmosdb/README.md index a8ff7e9..a8ea549 100644 --- a/cloud/azure/cosmosdb/README.md +++ b/cloud/azure/cosmosdb/README.md @@ -25,7 +25,6 @@ Creates DataDog monitors with the 
following checks: - Cosmos DB 4xx requests rate is high - Cosmos DB 5xx requests rate is high - Cosmos DB collection ${element(keys(var.cosmos_db_ru_utilization_collections),count.index)} RU utilization is high -- Cosmos DB has no request - Cosmos DB is down ## Inputs @@ -48,12 +47,6 @@ Creates DataDog monitors with the following checks: | cosmos_db_5xx_requests_enabled | Flag to enable Cosmos DB 5xx requests monitor | string | `true` | no | | cosmos_db_5xx_requests_message | Custom message for Cosmos DB 5xx requests monitor | string | `` | no | | cosmos_db_5xx_requests_silenced | Groups to mute for Cosmos DB 5xx requests monitor | map | `` | no | -| cosmos_db_no_request_enabled | Flag to enable Cosmos DB no request monitor | string | `true` | no | -| cosmos_db_no_request_extra_tags | Extra tags for Cosmos DB no request monitor | list | `` | no | -| cosmos_db_no_request_message | Custom message for Cosmos DB no request monitor | string | `` | no | -| cosmos_db_no_request_silenced | Groups to mute for Cosmos DB no request monitor | map | `` | no | -| cosmos_db_no_request_time_aggregator | Monitor aggregator for Cosmos DB no request [available values: min, max or avg] | string | `max` | no | -| cosmos_db_no_request_timeframe | Monitor timeframe for Cosmos DB no request [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | cosmos_db_ru_utilization_collections | Group to associate Cosmos DB collection to RU max. 
RU value has to be correlated with the monitor timeframe | map | - | yes | | cosmos_db_ru_utilization_enabled | Flag to enable Cosmos DB collection RU utilization monitor | string | `true` | no | | cosmos_db_ru_utilization_extra_tags | Extra tags for Cosmos DB collection RU utilization monitor | list | `` | no | @@ -84,7 +77,6 @@ Creates DataDog monitors with the following checks: | cosmos_db_5xx_requests_id | id for monitor cosmos_db_5xx_requests | | cosmos_db_ru_utilization_id | id for monitor cosmos_db_ru_utilization | | cosmos_db_status_id | id for monitor cosmos_db_status | -| cosmos_db_success_no_data_id | id for monitor cosmos_db_success_no_data | Related documentation --------------------- diff --git a/cloud/azure/cosmosdb/inputs.tf b/cloud/azure/cosmosdb/inputs.tf index 84c610c..a689131 100644 --- a/cloud/azure/cosmosdb/inputs.tf +++ b/cloud/azure/cosmosdb/inputs.tf @@ -156,42 +156,6 @@ variable "cosmos_db_5xx_request_timeframe" { default = "last_5m" } -variable "cosmos_db_no_request_message" { - description = "Custom message for Cosmos DB no request monitor" - type = "string" - default = "" -} - -variable "cosmos_db_no_request_enabled" { - description = "Flag to enable Cosmos DB no request monitor" - type = "string" - default = "true" -} - -variable "cosmos_db_no_request_silenced" { - description = "Groups to mute for Cosmos DB no request monitor" - type = "map" - default = {} -} - -variable "cosmos_db_no_request_extra_tags" { - description = "Extra tags for Cosmos DB no request monitor" - type = "list" - default = [] -} - -variable "cosmos_db_no_request_time_aggregator" { - description = "Monitor aggregator for Cosmos DB no request [available values: min, max or avg]" - type = "string" - default = "max" -} - -variable "cosmos_db_no_request_timeframe" { - description = "Monitor timeframe for Cosmos DB no request [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" - type = "string" - default = "last_5m" -} - variable 
"cosmos_db_ru_utilization_message" { description = "Custom message for Cosmos DB collection RU utilization monitor" type = "string" diff --git a/cloud/azure/cosmosdb/monitors-cosmosdb.tf b/cloud/azure/cosmosdb/monitors-cosmosdb.tf index 3442dcd..7f64696 100644 --- a/cloud/azure/cosmosdb/monitors-cosmosdb.tf +++ b/cloud/azure/cosmosdb/monitors-cosmosdb.tf @@ -134,36 +134,6 @@ resource "datadog_monitor" "cosmos_db_5xx_requests" { tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:cosmos_db", "team:claranet", "created-by:terraform", "${var.cosmos_db_5xx_request_rate_extra_tags}"] } -resource "datadog_monitor" "cosmos_db_success_no_data" { - count = "${var.cosmos_db_no_request_enabled ? 1 : 0}" - - name = "[${var.environment}] Cosmos DB has no request" - message = "${coalesce(var.cosmos_db_no_request_message, var.message)}" - - query = < Date: Thu, 30 Aug 2018 10:44:45 +0200 Subject: [PATCH 16/18] MON-237 Remove Cosmos RU monitor --- cloud/azure/README.md | 5 --- cloud/azure/cosmosdb/README.md | 17 -------- cloud/azure/cosmosdb/inputs.tf | 51 ----------------------- cloud/azure/cosmosdb/modules.tf | 11 ----- cloud/azure/cosmosdb/monitors-cosmosdb.tf | 39 +---------------- cloud/azure/cosmosdb/outputs.tf | 5 --- cloud/azure/inputs.tf | 51 ----------------------- cloud/azure/monitors.tf | 10 ----- 8 files changed, 1 insertion(+), 188 deletions(-) diff --git a/cloud/azure/README.md b/cloud/azure/README.md index daa36be..c51727d 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -91,11 +91,6 @@ Inputs | cosmos_db_5xx_request_rate_threshold_warning | Warning threshold for Cosmos DB 5xx requests monitor | string | `50` | no | | cosmos_db_5xx_requests_message | Custom message for Cosmos DB 5xx requests monitor | string | `` | no | | cosmos_db_5xx_requests_silenced | Groups to mute for Cosmos DB 5xx requests monitor | map | `` | no | -| cosmos_db_ru_utilization_collection | Group to associate Cosmos DB collection to RU max | map | - | 
yes | -| cosmos_db_ru_utilization_message | Custom message for Cosmos DB collection RU utilization monitor | string | `` | no | -| cosmos_db_ru_utilization_rate_threshold_critical | Critical threshold for Cosmos DB collection RU utilization monitor | string | `90` | no | -| cosmos_db_ru_utilization_rate_threshold_warning | Warning threshold for Cosmos DB collection RU utilization monitor | string | `80` | no | -| cosmos_db_ru_utilization_silenced | Groups to mute for Cosmos DB collection RU utilization monitor | map | `` | no | | datalakestore_status_message | Custom message for Datalake Store status monitor | string | `` | no | | datalakestore_status_silenced | Groups to mute for Datalake Store status monitor | map | `` | no | | datalakestore_status_time_aggregator | Monitor aggregator for Datalake Store status [available values: min, max or avg] | string | `max` | no | diff --git a/cloud/azure/cosmosdb/README.md b/cloud/azure/cosmosdb/README.md index a8ea549..6de3209 100644 --- a/cloud/azure/cosmosdb/README.md +++ b/cloud/azure/cosmosdb/README.md @@ -8,12 +8,6 @@ module "datadog-monitors-cloud-azure-cosmosdb" { environment = "${var.environment}" message = "${module.datadog-message-alerting.alerting-message}" - - # MyDocumentCollection is provisioned with 1000 RU/s in Azure so, - # we set the RU value for 5 minutes as input because 5m is the default evalutation timeframe - cosmos_db_ru_utilization_collections = { - "MyDocumentCollection" = 300000 # 1000 * 60 * 5 - } } ``` @@ -24,7 +18,6 @@ Creates DataDog monitors with the following checks: - Cosmos DB 4xx requests rate is high - Cosmos DB 5xx requests rate is high -- Cosmos DB collection ${element(keys(var.cosmos_db_ru_utilization_collections),count.index)} RU utilization is high - Cosmos DB is down ## Inputs @@ -47,15 +40,6 @@ Creates DataDog monitors with the following checks: | cosmos_db_5xx_requests_enabled | Flag to enable Cosmos DB 5xx requests monitor | string | `true` | no | | 
cosmos_db_5xx_requests_message | Custom message for Cosmos DB 5xx requests monitor | string | `` | no | | cosmos_db_5xx_requests_silenced | Groups to mute for Cosmos DB 5xx requests monitor | map | `` | no | -| cosmos_db_ru_utilization_collections | Group to associate Cosmos DB collection to RU max. RU value has to be correlated with the monitor timeframe | map | - | yes | -| cosmos_db_ru_utilization_enabled | Flag to enable Cosmos DB collection RU utilization monitor | string | `true` | no | -| cosmos_db_ru_utilization_extra_tags | Extra tags for Cosmos DB collection RU utilization monitor | list | `` | no | -| cosmos_db_ru_utilization_message | Custom message for Cosmos DB collection RU utilization monitor | string | `` | no | -| cosmos_db_ru_utilization_rate_threshold_critical | Critical threshold for Cosmos DB collection RU utilization monitor | string | `90` | no | -| cosmos_db_ru_utilization_rate_threshold_warning | Warning threshold for Cosmos DB collection RU utilization monitor | string | `80` | no | -| cosmos_db_ru_utilization_silenced | Groups to mute for Cosmos DB collection RU utilization monitor | map | `` | no | -| cosmos_db_ru_utilization_time_aggregator | Monitor aggregator for Cosmos DB RU utilization [available values: min, max or avg] | string | `sum` | no | -| cosmos_db_ru_utilization_timeframe | Monitor timeframe for Cosmos DB RU utilization [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | environment | Architecture environment | string | - | yes | | evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | @@ -75,7 +59,6 @@ Creates DataDog monitors with the following checks: |------|-------------| | cosmos_db_4xx_requests_id | id for monitor cosmos_db_4xx_requests | | cosmos_db_5xx_requests_id | id for monitor cosmos_db_5xx_requests 
| -| cosmos_db_ru_utilization_id | id for monitor cosmos_db_ru_utilization | | cosmos_db_status_id | id for monitor cosmos_db_status | Related documentation diff --git a/cloud/azure/cosmosdb/inputs.tf b/cloud/azure/cosmosdb/inputs.tf index a689131..4d6cd55 100644 --- a/cloud/azure/cosmosdb/inputs.tf +++ b/cloud/azure/cosmosdb/inputs.tf @@ -155,54 +155,3 @@ variable "cosmos_db_5xx_request_timeframe" { type = "string" default = "last_5m" } - -variable "cosmos_db_ru_utilization_message" { - description = "Custom message for Cosmos DB collection RU utilization monitor" - type = "string" - default = "" -} - -variable "cosmos_db_ru_utilization_enabled" { - description = "Flag to enable Cosmos DB collection RU utilization monitor" - type = "string" - default = "true" -} - -variable "cosmos_db_ru_utilization_silenced" { - description = "Groups to mute for Cosmos DB collection RU utilization monitor" - type = "map" - default = {} -} - -variable "cosmos_db_ru_utilization_rate_threshold_critical" { - description = "Critical threshold for Cosmos DB collection RU utilization monitor" - default = 90 -} - -variable "cosmos_db_ru_utilization_rate_threshold_warning" { - description = "Warning threshold for Cosmos DB collection RU utilization monitor" - default = 80 -} - -variable "cosmos_db_ru_utilization_extra_tags" { - description = "Extra tags for Cosmos DB collection RU utilization monitor" - type = "list" - default = [] -} - -variable "cosmos_db_ru_utilization_time_aggregator" { - description = "Monitor aggregator for Cosmos DB RU utilization [available values: min, max or avg]" - type = "string" - default = "sum" -} - -variable "cosmos_db_ru_utilization_timeframe" { - description = "Monitor timeframe for Cosmos DB RU utilization [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" - type = "string" - default = "last_5m" -} - -variable "cosmos_db_ru_utilization_collections" { - description = "Group to associate Cosmos DB collection to RU 
max. RU value has to be correlated with the monitor timeframe" - type = "map" -} diff --git a/cloud/azure/cosmosdb/modules.tf b/cloud/azure/cosmosdb/modules.tf index aa2ac12..889e29a 100644 --- a/cloud/azure/cosmosdb/modules.tf +++ b/cloud/azure/cosmosdb/modules.tf @@ -17,14 +17,3 @@ module "filter-tags-statuscode" { extra_tags = ["statuscode:%s"] } - -module "filter-tags-collection" { - source = "../../../common/filter-tags" - - environment = "${var.environment}" - resource = "cosmosdb" - filter_tags_use_defaults = "${var.filter_tags_use_defaults}" - filter_tags_custom = "${var.filter_tags_custom},collectionname:%s" - - extra_tags = ["collectionname:%s"] -} diff --git a/cloud/azure/cosmosdb/monitors-cosmosdb.tf b/cloud/azure/cosmosdb/monitors-cosmosdb.tf index 7f64696..9df1dc4 100644 --- a/cloud/azure/cosmosdb/monitors-cosmosdb.tf +++ b/cloud/azure/cosmosdb/monitors-cosmosdb.tf @@ -38,6 +38,7 @@ resource "datadog_monitor" "cosmos_db_4xx_requests" { name = "[${var.environment}] Cosmos DB 4xx requests rate is high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = "${coalesce(var.cosmos_db_4xx_requests_message, var.message)}" + # List of available status codes : https://docs.microsoft.com/en-us/rest/api/cosmos-db/http-status-codes-for-cosmosdb query = < ${var.cosmos_db_ru_utilization_rate_threshold_critical} - EOF - - type = "metric alert" - - thresholds { - critical = "${var.cosmos_db_ru_utilization_rate_threshold_critical}" - warning = "${var.cosmos_db_ru_utilization_rate_threshold_warning}" - } - - silenced = "${var.cosmos_db_ru_utilization_silenced}" - - notify_no_data = false - evaluation_delay = "${var.evaluation_delay}" - renotify_interval = 0 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = "${var.new_host_delay}" - - tags = ["env:${var.environment}", "type:cloud", 
"provider:azure", "resource:cosmos_db", "team:claranet", "created-by:terraform", "${var.cosmos_db_ru_utilization_extra_tags}"] -} diff --git a/cloud/azure/cosmosdb/outputs.tf b/cloud/azure/cosmosdb/outputs.tf index ea44e24..6901b39 100644 --- a/cloud/azure/cosmosdb/outputs.tf +++ b/cloud/azure/cosmosdb/outputs.tf @@ -12,8 +12,3 @@ output "cosmos_db_5xx_requests_id" { description = "id for monitor cosmos_db_5xx_requests" value = "${datadog_monitor.cosmos_db_5xx_requests.*.id}" } - -output "cosmos_db_ru_utilization_id" { - description = "id for monitor cosmos_db_ru_utilization" - value = "${datadog_monitor.cosmos_db_ru_utilization.*.id}" -} diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf index bb4ff3d..ed87c29 100644 --- a/cloud/azure/inputs.tf +++ b/cloud/azure/inputs.tf @@ -1751,57 +1751,6 @@ variable "cosmos_db_5xx_request_timeframe" { default = "last_5m" } -variable "cosmos_db_ru_utilization_message" { - description = "Custom message for Cosmos DB collection RU utilization monitor" - type = "string" - default = "" -} - -variable "cosmos_db_ru_utilization_enabled" { - description = "Flag to enable Cosmos DB collection RU utilization monitor" - type = "string" - default = "true" -} - -variable "cosmos_db_ru_utilization_silenced" { - description = "Groups to mute for Cosmos DB collection RU utilization monitor" - type = "map" - default = {} -} - -variable "cosmos_db_ru_utilization_rate_threshold_critical" { - description = "Critical threshold for Cosmos DB collection RU utilization monitor" - default = 90 -} - -variable "cosmos_db_ru_utilization_rate_threshold_warning" { - description = "Warning threshold for Cosmos DB collection RU utilization monitor" - default = 80 -} - -variable "cosmos_db_ru_utilization_extra_tags" { - description = "Extra tags for Cosmos DB collection RU utilization monitor" - type = "list" - default = [] -} - -variable "cosmos_db_ru_utilization_time_aggregator" { - description = "Monitor aggregator for Cosmos DB RU utilization 
[available values: min, max or avg]" - type = "string" - default = "sum" -} - -variable "cosmos_db_ru_utilization_timeframe" { - description = "Monitor timeframe for Cosmos DB RU utilization [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" - type = "string" - default = "last_5m" -} - -variable "cosmos_db_ru_utilization_collections" { - description = "Group to associate Cosmos DB collection to RU max. RU value has to be correlated with the monitor timeframe" - type = "map" -} - # Azure Datalake Store specific variables variable "datalakestore_status_enabled" { description = "Flag to enable Datalake Store status monitor" diff --git a/cloud/azure/monitors.tf b/cloud/azure/monitors.tf index 8fc659b..d044a79 100644 --- a/cloud/azure/monitors.tf +++ b/cloud/azure/monitors.tf @@ -422,16 +422,6 @@ module "cosmosdb" { cosmos_db_5xx_request_rate_extra_tags = "${var.cosmos_db_5xx_request_rate_extra_tags}" cosmos_db_5xx_request_time_aggregator = "${var.cosmos_db_5xx_request_time_aggregator}" cosmos_db_5xx_request_timeframe = "${var.cosmos_db_5xx_request_timeframe}" - - cosmos_db_ru_utilization_enabled = "${var.cosmos_db_ru_utilization_enabled}" - cosmos_db_ru_utilization_rate_threshold_critical = "${var.cosmos_db_ru_utilization_rate_threshold_critical}" - cosmos_db_ru_utilization_rate_threshold_warning = "${var.cosmos_db_ru_utilization_rate_threshold_warning}" - cosmos_db_ru_utilization_message = "${var.cosmos_db_ru_utilization_message}" - cosmos_db_ru_utilization_silenced = "${var.cosmos_db_ru_utilization_silenced}" - cosmos_db_ru_utilization_extra_tags = "${var.cosmos_db_ru_utilization_extra_tags}" - cosmos_db_ru_utilization_time_aggregator = "${var.cosmos_db_ru_utilization_time_aggregator}" - cosmos_db_ru_utilization_timeframe = "${var.cosmos_db_ru_utilization_timeframe}" - cosmos_db_ru_utilization_collections = "${var.cosmos_db_ru_utilization_collections}" } module "datalakestore" { From 25bd96fc9b0711441b464bf88a39c3b9e6b52d3d Mon 
Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Thu, 30 Aug 2018 15:39:28 +0200 Subject: [PATCH 17/18] MON-237 Change Cosmos DB requests monitors aggregate --- cloud/azure/cosmosdb/monitors-cosmosdb.tf | 56 +++++++++++------------ 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/cloud/azure/cosmosdb/monitors-cosmosdb.tf b/cloud/azure/cosmosdb/monitors-cosmosdb.tf index 9df1dc4..d24c615 100644 --- a/cloud/azure/cosmosdb/monitors-cosmosdb.tf +++ b/cloud/azure/cosmosdb/monitors-cosmosdb.tf @@ -42,29 +42,29 @@ resource "datadog_monitor" "cosmos_db_4xx_requests" { query = < ${var.cosmos_db_4xx_request_rate_threshold_critical} @@ -101,13 +101,13 @@ resource "datadog_monitor" "cosmos_db_5xx_requests" { query = < ${var.cosmos_db_5xx_request_rate_threshold_critical} From 878380989f0c7e3158d681b1cccb0dffdc5cb9d1 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Fri, 14 Sep 2018 14:06:06 +0200 Subject: [PATCH 18/18] MON-237 Fix require_full_window property --- cloud/azure/cosmosdb/monitors-cosmosdb.tf | 4 ++-- cloud/azure/servicebus/monitors-service-bus.tf | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cloud/azure/cosmosdb/monitors-cosmosdb.tf b/cloud/azure/cosmosdb/monitors-cosmosdb.tf index d24c615..f4aa6bb 100644 --- a/cloud/azure/cosmosdb/monitors-cosmosdb.tf +++ b/cloud/azure/cosmosdb/monitors-cosmosdb.tf @@ -86,7 +86,7 @@ resource "datadog_monitor" "cosmos_db_4xx_requests" { timeout_h = 0 include_tags = true locked = false - require_full_window = true + require_full_window = false new_host_delay = "${var.new_host_delay}" tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:cosmos_db", "team:claranet", "created-by:terraform", "${var.cosmos_db_4xx_request_extra_tags}"] @@ -129,7 +129,7 @@ resource "datadog_monitor" "cosmos_db_5xx_requests" { timeout_h = 0 include_tags = true locked = false - require_full_window = true + require_full_window = false new_host_delay = "${var.new_host_delay}" tags = 
["env:${var.environment}", "type:cloud", "provider:azure", "resource:cosmos_db", "team:claranet", "created-by:terraform", "${var.cosmos_db_5xx_request_rate_extra_tags}"] diff --git a/cloud/azure/servicebus/monitors-service-bus.tf b/cloud/azure/servicebus/monitors-service-bus.tf index 2e953d0..82d0030 100644 --- a/cloud/azure/servicebus/monitors-service-bus.tf +++ b/cloud/azure/servicebus/monitors-service-bus.tf @@ -50,7 +50,7 @@ resource "datadog_monitor" "service_bus_no_active_connections" { timeout_h = 0 include_tags = true locked = false - require_full_window = true + require_full_window = false new_host_delay = "${var.new_host_delay}" tags = ["env:${var.environment}", "resource:servicebus", "team:azure", "provider:azure"] @@ -86,7 +86,7 @@ resource "datadog_monitor" "service_bus_user_errors" { timeout_h = 0 include_tags = true locked = false - require_full_window = true + require_full_window = false new_host_delay = "${var.new_host_delay}" tags = ["env:${var.environment}", "resource:servicebus", "team:azure", "provider:azure"] @@ -122,7 +122,7 @@ resource "datadog_monitor" "service_bus_server_errors" { timeout_h = 0 include_tags = true locked = false - require_full_window = true + require_full_window = false new_host_delay = "${var.new_host_delay}" tags = ["env:${var.environment}", "resource:servicebus", "team:azure", "provider:azure"]