From a0ac2d76295a81426f72f4f50736cf46a1cec2a5 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Thu, 8 Feb 2018 17:54:51 +0100 Subject: [PATCH 1/6] MON-79 Raise critical thresholds and add warning thresholds to avoid flapping alerts during NBH --- cloud/azure/inputs.tf | 45 +++++++++++++++++++ cloud/azure/monitors.tf | 9 ++++ cloud/azure/storage/inputs.tf | 45 +++++++++++++++++++ cloud/azure/storage/monitors-azure-storage.tf | 9 ++++ 4 files changed, 108 insertions(+) diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf index f56515b..941aa5a 100644 --- a/cloud/azure/inputs.tf +++ b/cloud/azure/inputs.tf @@ -312,46 +312,91 @@ variable "sqldatabase_deadlock_threshold_critical" { # Azure Storage specific variables variable "storage_availability_threshold_critical" { description = "Minimum acceptable percent of availability for a storage" + default = 50 +} + +variable "storage_availability_threshold_warning" { + description = "Warning regarding acceptable percent of availability for a storage" default = 90 } variable "storage_successful_requests_threshold_critical" { description = "Minimum acceptable percent of successful requests for a storage" + default = 50 +} + +variable "storage_successful_requests_threshold_warning" { + description = "Warning regarding acceptable percent of successful requests for a storage" default = 90 } variable "storage_latency_threshold_critical" { description = "Maximum acceptable end to end latency (ms) for a storage" + default = 2000 +} + +variable "storage_latency_threshold_warning" { + description = "Warning regarding acceptable end to end latency (ms) for a storage" default = 1000 } variable "storage_timeout_error_requests_threshold_critical" { description = "Maximum acceptable percent of timeout error requests for a storage" + default = 50 +} + +variable "storage_timeout_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of timeout error requests for a storage" default = 5 } variable 
"storage_network_error_requests_threshold_critical" { description = "Maximum acceptable percent of network error requests for a storage" + default = 50 +} + +variable "storage_network_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of network error requests for a storage" default = 5 } variable "storage_throttling_error_requests_threshold_critical" { description = "Maximum acceptable percent of throttling error requests for a storage" + default = 50 +} + +variable "storage_throttling_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of throttling error requests for a storage" default = 10 } variable "storage_server_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of server other error requests for a storage" + default = 50 +} + +variable "storage_server_other_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of server other error requests for a storage" default = 10 } variable "storage_client_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of client other error requests for a storage" + default = 50 +} + +variable "storage_client_other_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of client other error requests for a storage" default = 15 } variable "storage_authorization_error_requests_threshold_critical" { description = "Maximum acceptable percent of authorization error requests for a storage" + default = 50 +} + +variable "storage_authorization_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of authorization error requests for a storage" default = 15 } diff --git a/cloud/azure/monitors.tf b/cloud/azure/monitors.tf index beaa77b..650d3fd 100644 --- a/cloud/azure/monitors.tf +++ b/cloud/azure/monitors.tf @@ -135,14 +135,23 @@ module "storage" { filter_tags_custom = 
"${var.filter_tags_custom}" authorization_error_requests_threshold_critical = "${var.storage_authorization_error_requests_threshold_critical}" + authorization_error_requests_threshold_warning = "${var.storage_authorization_error_requests_threshold_warning}" availability_threshold_critical = "${var.storage_availability_threshold_critical}" + availability_threshold_warning = "${var.storage_availability_threshold_warning}" client_other_error_requests_threshold_critical = "${var.storage_client_other_error_requests_threshold_critical}" + client_other_error_requests_threshold_warning = "${var.storage_client_other_error_requests_threshold_warning}" latency_threshold_critical = "${var.storage_latency_threshold_critical}" + latency_threshold_warning = "${var.storage_latency_threshold_warning}" network_error_requests_threshold_critical = "${var.storage_network_error_requests_threshold_critical}" + network_error_requests_threshold_warning = "${var.storage_network_error_requests_threshold_warning}" server_other_error_requests_threshold_critical = "${var.storage_server_other_error_requests_threshold_critical}" + server_other_error_requests_threshold_warning = "${var.storage_server_other_error_requests_threshold_warning}" successful_requests_threshold_critical = "${var.storage_successful_requests_threshold_critical}" + successful_requests_threshold_warning = "${var.storage_successful_requests_threshold_warning}" throttling_error_requests_threshold_critical = "${var.storage_throttling_error_requests_threshold_critical}" + throttling_error_requests_threshold_warning = "${var.storage_throttling_error_requests_threshold_warning}" timeout_error_requests_threshold_critical = "${var.storage_timeout_error_requests_threshold_critical}" + timeout_error_requests_threshold_warning = "${var.storage_timeout_error_requests_threshold_warning}" } module "streamanalytics" { diff --git a/cloud/azure/storage/inputs.tf b/cloud/azure/storage/inputs.tf index e48df74..d210ca7 100644 --- 
a/cloud/azure/storage/inputs.tf +++ b/cloud/azure/storage/inputs.tf @@ -27,45 +27,90 @@ variable "filter_tags_custom" { # Azure Storage specific variable "availability_threshold_critical" { description = "Minimum acceptable percent of availability for a storage" + default = 50 +} + +variable "availability_threshold_warning" { + description = "Warning regarding acceptable percent of availability for a storage" default = 90 } variable "successful_requests_threshold_critical" { description = "Minimum acceptable percent of successful requests for a storage" + default = 50 +} + +variable "successful_requests_threshold_warning" { + description = "Warning regarding acceptable percent of successful requests for a storage" default = 90 } variable "latency_threshold_critical" { description = "Maximum acceptable end to end latency (ms) for a storage" + default = 2000 +} + +variable "latency_threshold_warning" { + description = "Warning regarding acceptable end to end latency (ms) for a storage" default = 1000 } variable "timeout_error_requests_threshold_critical" { description = "Maximum acceptable percent of timeout error requests for a storage" + default = 50 +} + +variable "timeout_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of timeout error requests for a storage" default = 5 } variable "network_error_requests_threshold_critical" { description = "Maximum acceptable percent of network error requests for a storage" + default = 50 +} + +variable "network_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of network error requests for a storage" default = 5 } variable "throttling_error_requests_threshold_critical" { description = "Maximum acceptable percent of throttling error requests for a storage" + default = 50 +} + +variable "throttling_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of throttling error requests for a storage" default = 10 } 
variable "server_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of server other error requests for a storage" + default = 50 +} + +variable "server_other_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of server other error requests for a storage" default = 10 } variable "client_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of client other error requests for a storage" + default = 50 +} + +variable "client_other_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of client other error requests for a storage" default = 15 } variable "authorization_error_requests_threshold_critical" { description = "Maximum acceptable percent of authorization error requests for a storage" + default = 50 +} + +variable "authorization_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of authorization error requests for a storage" default = 15 } diff --git a/cloud/azure/storage/monitors-azure-storage.tf b/cloud/azure/storage/monitors-azure-storage.tf index 0e5137c..d60c6ac 100644 --- a/cloud/azure/storage/monitors-azure-storage.tf +++ b/cloud/azure/storage/monitors-azure-storage.tf @@ -18,6 +18,7 @@ EOF thresholds { critical = "${var.availability_threshold_critical}" + warning = "${var.availability_threshold_warning}" } type = "metric alert" @@ -47,6 +48,7 @@ EOF thresholds { critical = "${var.successful_requests_threshold_critical}" + warning = "${var.successful_requests_threshold_warning}" } type = "metric alert" @@ -76,6 +78,7 @@ EOF thresholds { critical = "${var.latency_threshold_critical}" + warning = "${var.latency_threshold_warning}" } type = "metric alert" @@ -105,6 +108,7 @@ EOF thresholds { critical = "${var.timeout_error_requests_threshold_critical}" + warning = "${var.timeout_error_requests_threshold_warning}" } type = "metric alert" @@ -134,6 +138,7 @@ EOF thresholds { critical = 
"${var.network_error_requests_threshold_critical}" + warning = "${var.network_error_requests_threshold_warning}" } type = "metric alert" @@ -163,6 +168,7 @@ EOF thresholds { critical = "${var.throttling_error_requests_threshold_critical}" + warning = "${var.throttling_error_requests_threshold_warning}" } type = "metric alert" @@ -192,6 +198,7 @@ EOF thresholds { critical = "${var.server_other_error_requests_threshold_critical}" + warning = "${var.server_other_error_requests_threshold_warning}" } type = "metric alert" @@ -221,6 +228,7 @@ EOF thresholds { critical = "${var.client_other_error_requests_threshold_critical}" + warning = "${var.client_other_error_requests_threshold_warning}" } type = "metric alert" @@ -250,6 +258,7 @@ EOF thresholds { critical = "${var.authorization_error_requests_threshold_critical}" + warning = "${var.authorization_error_requests_threshold_warning}" } type = "metric alert" From f698efbe43248c0f11d9a1856e6ba7bb120bde6d Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Tue, 13 Feb 2018 17:39:42 +0100 Subject: [PATCH 2/6] MON-79 Update README --- cloud/azure/README.md | 31 ++++++++++++++++++++----------- cloud/azure/storage/README.md | 27 ++++++++++++++++++--------- 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/cloud/azure/README.md b/cloud/azure/README.md index 05bc25d..2b5ca45 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -35,12 +35,12 @@ Inputs | apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no | | apimanagement_successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no | | apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no | -| appservices_http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `90` | no | -| 
appservices_http_successful_requests_threshold_warning | Minimum warning acceptable percent of 2xx & 3xx requests | string | `95` | no | | appservices_http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no | | appservices_http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no | | appservices_http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no | | appservices_http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no | +| appservices_http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `90` | no | +| appservices_http_successful_requests_threshold_warning | Minimum warning acceptable percent of 2xx & 3xx requests | string | `95` | no | | appservices_memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | appservices_memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | | appservices_response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | @@ -92,15 +92,24 @@ Inputs | sqldatabase_diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no | | sqldatabase_dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no | | sqldatabase_dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no | -| storage_authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `15` | no | -| storage_availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `90` | no | -| storage_client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `15` 
| no | -| storage_latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `1000` | no | -| storage_network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `5` | no | -| storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `10` | no | -| storage_successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `90` | no | -| storage_throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `10` | no | -| storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `5` | no | +| storage_authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `50` | no | +| storage_authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `15` | no | +| storage_availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `50` | no | +| storage_availability_threshold_warning | Warning regarding acceptable percent of availability for a storage | string | `90` | no | +| storage_client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `50` | no | +| storage_client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `15` | no | +| storage_latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `2000` | no | +| storage_latency_threshold_warning | Warning regarding acceptable end to end latency (ms) for a storage | string 
| `1000` | no | +| storage_network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `50` | no | +| storage_network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `5` | no | +| storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `50` | no | +| storage_server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `10` | no | +| storage_successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `50` | no | +| storage_successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `90` | no | +| storage_throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `50` | no | +| storage_throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `10` | no | +| storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `50` | no | +| storage_timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `5` | no | | streamanalytics_conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no | | streamanalytics_conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no | | streamanalytics_failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | diff --git a/cloud/azure/storage/README.md b/cloud/azure/storage/README.md index 
b999f35..e5a93f6 100644 --- a/cloud/azure/storage/README.md +++ b/cloud/azure/storage/README.md @@ -32,20 +32,29 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `15` | no | -| availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `90` | no | -| client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `15` | no | +| authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `50` | no | +| authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `15` | no | +| availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `50` | no | +| availability_threshold_warning | Warning regarding acceptable percent of availability for a storage | string | `90` | no | +| client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `50` | no | +| client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `15` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | -| latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `1000` | no | +| latency_threshold_critical | Maximum acceptable end 
to end latency (ms) for a storage | string | `2000` | no | +| latency_threshold_warning | Warning regarding acceptable end to end latency (ms) for a storage | string | `1000` | no | | message | Message sent when a Redis monitor is triggered | string | - | yes | -| network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `5` | no | -| server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `10` | no | -| successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `90` | no | -| throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `10` | no | -| timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `5` | no | +| network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `50` | no | +| network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `5` | no | +| server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `50` | no | +| server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `10` | no | +| successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `50` | no | +| successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `90` | no | +| throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `50` | no | +| 
throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `10` | no | +| timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `50` | no | +| timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `5` | no | Related documentation --------------------- From 97192755c41093f29a3ca9a74e7fbbcf0f1f0285 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Wed, 14 Feb 2018 15:32:40 +0100 Subject: [PATCH 3/6] MON-79 Update storage successful requests threshold --- cloud/azure/README.md | 4 ++-- cloud/azure/inputs.tf | 4 ++-- cloud/azure/storage/README.md | 4 ++-- cloud/azure/storage/inputs.tf | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cloud/azure/README.md b/cloud/azure/README.md index 2b5ca45..e481eab 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -104,8 +104,8 @@ Inputs | storage_network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `5` | no | | storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `50` | no | | storage_server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `10` | no | -| storage_successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `50` | no | -| storage_successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `90` | no | +| storage_successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `10` | no | +| storage_successful_requests_threshold_warning | Warning 
regarding acceptable percent of successful requests for a storage | string | `30` | no | | storage_throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `50` | no | | storage_throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `10` | no | | storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `50` | no | diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf index 941aa5a..0736061 100644 --- a/cloud/azure/inputs.tf +++ b/cloud/azure/inputs.tf @@ -322,12 +322,12 @@ variable "storage_availability_threshold_warning" { variable "storage_successful_requests_threshold_critical" { description = "Minimum acceptable percent of successful requests for a storage" - default = 50 + default = 10 } variable "storage_successful_requests_threshold_warning" { description = "Warning regarding acceptable percent of successful requests for a storage" - default = 90 + default = 30 } variable "storage_latency_threshold_critical" { diff --git a/cloud/azure/storage/README.md b/cloud/azure/storage/README.md index e5a93f6..6933ef1 100644 --- a/cloud/azure/storage/README.md +++ b/cloud/azure/storage/README.md @@ -49,8 +49,8 @@ Inputs | network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `5` | no | | server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `50` | no | | server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `10` | no | -| successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `50` | no | -| successful_requests_threshold_warning | Warning 
regarding acceptable percent of successful requests for a storage | string | `90` | no | +| successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `10` | no | +| successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `30` | no | | throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `50` | no | | throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `10` | no | | timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `50` | no | diff --git a/cloud/azure/storage/inputs.tf b/cloud/azure/storage/inputs.tf index d210ca7..76a52a2 100644 --- a/cloud/azure/storage/inputs.tf +++ b/cloud/azure/storage/inputs.tf @@ -37,12 +37,12 @@ variable "availability_threshold_warning" { variable "successful_requests_threshold_critical" { description = "Minimum acceptable percent of successful requests for a storage" - default = 50 + default = 10 } variable "successful_requests_threshold_warning" { description = "Warning regarding acceptable percent of successful requests for a storage" - default = 90 + default = 30 } variable "latency_threshold_critical" { From fed0d592e93cf2619bdc380078b1cdd9147085a3 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Tue, 13 Feb 2018 15:34:31 +0100 Subject: [PATCH 4/6] MON-73 Raise all thresholds involving erroneous application behaviors to only handle possible infrastructure failures --- cloud/azure/README.md | 96 +++++++------- cloud/azure/apimanagement/README.md | 12 +- cloud/azure/apimanagement/inputs.tf | 28 ++++- .../monitors-azure-apimanagement.tf | 4 + cloud/azure/app-services/README.md | 20 +-- cloud/azure/app-services/inputs.tf | 26 ++-- cloud/azure/eventhub/README.md | 8 +- 
cloud/azure/eventhub/inputs.tf | 8 +- cloud/azure/inputs.tf | 118 ++++++++++-------- cloud/azure/iothubs/README.md | 34 ++--- cloud/azure/iothubs/inputs.tf | 32 ++--- cloud/azure/monitors.tf | 4 + cloud/azure/storage/README.md | 24 ++-- cloud/azure/storage/inputs.tf | 24 ++-- 14 files changed, 247 insertions(+), 191 deletions(-) diff --git a/cloud/azure/README.md b/cloud/azure/README.md index e481eab..e462bd5 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -31,46 +31,50 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| apimanagement_failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no | -| apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no | -| apimanagement_successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no | -| apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no | -| appservices_http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no | -| appservices_http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no | -| appservices_http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no | -| appservices_http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no | -| appservices_http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `90` | no | -| appservices_http_successful_requests_threshold_warning | Minimum warning acceptable percent of 2xx & 3xx requests | string | `95` | no | -| appservices_memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | -| 
appservices_memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | -| appservices_response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | -| appservices_response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | +| apimanagement_failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `90` | no | +| apimanagement_failed_requests_threshold_warning | Warning regarding acceptable percent of failed requests | string | `50` | no | +| apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `90` | no | +| apimanagement_other_requests_threshold_warning | Warning regarding acceptable percent of other requests | string | `50` | no | +| apimanagement_successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `10` | no | +| apimanagement_successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests | string | `30` | no | +| apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `90` | no | +| apimanagement_unauthorized_requests_threshold_warning | Warning regarding acceptable percent of unauthorized requests | string | `50` | no | +| appservices_http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `90` | no | +| appservices_http_4xx_requests_threshold_warning | Warning regarding acceptable percent of 4xx errors | string | `50` | no | +| appservices_http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `90` | no | +| appservices_http_5xx_requests_threshold_warning | Warning regarding acceptable percent of 5xx errors | string | `50` | no | +| appservices_http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | 
`10` | no | +| appservices_http_successful_requests_threshold_warning | Warning regarding acceptable percent of 2xx & 3xx requests | string | `30` | no | +| appservices_memory_usage_threshold_critical | Alerting threshold in Mib | string | `1073741824` | no | +| appservices_memory_usage_threshold_warning | Warning threshold in MiB | string | `536870912` | no | +| appservices_response_time_threshold_critical | Alerting threshold in seconds | string | `10` | no | +| appservices_response_time_threshold_warning | Warning threshold in seconds | string | `5` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| eventhub_errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no | -| eventhub_errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no | -| eventhub_failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no | -| eventhub_failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no | +| eventhub_errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `90` | no | +| eventhub_errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `50` | no | +| eventhub_failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `90` | no | +| eventhub_failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `50` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | 
iothub_dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no | | iothub_dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no | -| iothub_failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | -| iothub_failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | -| iothub_failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | -| iothub_failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | -| iothub_failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | -| iothub_failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | -| iothub_failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_listjobs_rate_threshold_warning | ListJobs 
Failed rate limit (warning threshold) | string | `0` | no | -| iothub_failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `90` | no | +| iothub_failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `90` | no | +| iothub_failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `90` | no | +| iothub_failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `90` | no | +| iothub_failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `90` | no | +| iothub_failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `90` | no | +| iothub_failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `90` | no | +| 
iothub_failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `90` | no | +| iothub_failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` | no | | iothub_fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | | iothub_fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | | iothub_invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | @@ -92,24 +96,24 @@ Inputs | sqldatabase_diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no | | sqldatabase_dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no | | sqldatabase_dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no | -| storage_authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `50` | no | -| storage_authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `15` | no | +| storage_authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `90` | no | +| storage_authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `50` | no | | storage_availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `50` | no | | storage_availability_threshold_warning | Warning regarding acceptable percent of availability for a storage | string | 
`90` | no | -| storage_client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `50` | no | -| storage_client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `15` | no | +| storage_client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `90` | no | +| storage_client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `50` | no | | storage_latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `2000` | no | | storage_latency_threshold_warning | Warning regarding acceptable end to end latency (ms) for a storage | string | `1000` | no | -| storage_network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `50` | no | -| storage_network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `5` | no | -| storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `50` | no | -| storage_server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `10` | no | +| storage_network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `90` | no | +| storage_network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `50` | no | +| storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `90` | no | +| 
storage_server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `50` | no | | storage_successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `10` | no | | storage_successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `30` | no | -| storage_throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `50` | no | -| storage_throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `10` | no | -| storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `50` | no | -| storage_timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `5` | no | +| storage_throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `90` | no | +| storage_throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `50` | no | +| storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `90` | no | +| storage_timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `50` | no | | streamanalytics_conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no | | streamanalytics_conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no | | streamanalytics_failed_function_requests_threshold_critical | Failed Function Request 
rate limit (critical threshold) | string | `10` | no | diff --git a/cloud/azure/apimanagement/README.md b/cloud/azure/apimanagement/README.md index e59e81a..3c94724 100644 --- a/cloud/azure/apimanagement/README.md +++ b/cloud/azure/apimanagement/README.md @@ -29,13 +29,17 @@ Inputs |------|-------------|:----:|:-----:|:-----:| | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no | +| failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `90` | no | +| failed_requests_threshold_warning | Warning regarding acceptable percent of failed requests | string | `50` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when a Redis monitor is triggered | string | - | yes | -| other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no | -| successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no | -| unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no | +| other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `90` | no | +| other_requests_threshold_warning | Warning regarding acceptable percent of other requests | string | `50` | no | +| successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `10` | no | +| successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests | string | `30` | no | +| unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | 
string | `90` | no | +| unauthorized_requests_threshold_warning | Warning regarding acceptable percent of unauthorized requests | string | `50` | no | Related documentation --------------------- diff --git a/cloud/azure/apimanagement/inputs.tf b/cloud/azure/apimanagement/inputs.tf index 7d04b46..45e85fe 100644 --- a/cloud/azure/apimanagement/inputs.tf +++ b/cloud/azure/apimanagement/inputs.tf @@ -27,20 +27,40 @@ variable "filter_tags_custom" { # Azure API Management specific variable "failed_requests_threshold_critical" { description = "Maximum acceptable percent of failed requests" - default = 5 + default = 90 +} + +variable "failed_requests_threshold_warning" { + description = "Warning regarding acceptable percent of failed requests" + default = 50 } variable "other_requests_threshold_critical" { description = "Maximum acceptable percent of other requests" - default = 5 + default = 90 +} + +variable "other_requests_threshold_warning" { + description = "Warning regarding acceptable percent of other requests" + default = 50 } variable "unauthorized_requests_threshold_critical" { description = "Maximum acceptable percent of unauthorized requests" - default = 5 + default = 90 +} + +variable "unauthorized_requests_threshold_warning" { + description = "Warning regarding acceptable percent of unauthorized requests" + default = 50 } variable "successful_requests_threshold_critical" { description = "Minimum acceptable percent of successful requests" - default = 90 + default = 10 +} + +variable "successful_requests_threshold_warning" { + description = "Warning regarding acceptable percent of successful requests" + default = 30 } diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf index 394812b..90da0ab 100644 --- a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -49,6 +49,7 @@ resource "datadog_monitor" 
"apimgt_failed_requests" { thresholds { critical = "${var.failed_requests_threshold_critical}" + warning = "${var.failed_requests_threshold_warning}" } type = "metric alert" @@ -79,6 +80,7 @@ resource "datadog_monitor" "apimgt_other_requests" { thresholds { critical = "${var.other_requests_threshold_critical}" + warning = "${var.other_requests_threshold_warning}" } type = "metric alert" @@ -109,6 +111,7 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" { thresholds { critical = "${var.unauthorized_requests_threshold_critical}" + warning = "${var.unauthorized_requests_threshold_warning}" } type = "metric alert" @@ -139,6 +142,7 @@ resource "datadog_monitor" "apimgt_successful_requests" { thresholds { critical = "${var.successful_requests_threshold_critical}" + warning = "${var.successful_requests_threshold_warning}" } type = "metric alert" diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index 40eb946..e4d12c1 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -32,17 +32,17 @@ Inputs | environment | Architecture environment | string | - | yes | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | -| http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `90` | no | -| http_successful_requests_threshold_warning | Minimum warning acceptable percent of 2xx & 3xx requests | string | `95` | no | -| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no | -| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no | -| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no | -| http_5xx_requests_threshold_warning | 
Maximum warning acceptable percent of 5xx errors | string | `10` | no | -| memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | -| memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | +| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `90` | no | +| http_4xx_requests_threshold_warning | Warning regarding acceptable percent of 4xx errors | string | `50` | no | +| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `90` | no | +| http_5xx_requests_threshold_warning | Warning regarding acceptable percent of 5xx errors | string | `50` | no | +| http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `10` | no | +| http_successful_requests_threshold_warning | Warning regarding acceptable percent of 2xx & 3xx requests | string | `30` | no | +| memory_usage_threshold_critical | Alerting threshold in Mib | string | `1073741824` | no | +| memory_usage_threshold_warning | Warning threshold in MiB | string | `536870912` | no | | message | Message sent when a monitor is triggered | string | - | yes | -| response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | -| response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | +| response_time_threshold_critical | Alerting threshold in seconds | string | `10` | no | +| response_time_threshold_warning | Warning threshold in seconds | string | `5` | no | Related documentation --------------------- diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index ab9ea74..c2de792 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -27,12 +27,12 @@ variable "delay" { ################################### variable "response_time_threshold_critical" { - default = 0.8 + default = 10 description = 
"Alerting threshold in seconds" } variable "response_time_threshold_warning" { - default = 0.4 + default = 5 description = "Warning threshold in seconds" } @@ -41,12 +41,12 @@ variable "response_time_threshold_warning" { ################################### variable "memory_usage_threshold_critical" { - default = 52430000 + default = 1073741824 # 1 GiB, expressed in bytes description = "Alerting threshold in Mib" } variable "memory_usage_threshold_warning" { - default = 33550000 + default = 536870912 # 512 MiB, expressed in bytes description = "Warning threshold in MiB" } @@ -55,13 +55,13 @@ variable "memory_usage_threshold_warning" { ################################# variable "http_5xx_requests_threshold_critical" { - default = 20 + default = 90 description = "Maximum critical acceptable percent of 5xx errors" } variable "http_5xx_requests_threshold_warning" { - default = 10 - description = "Maximum warning acceptable percent of 5xx errors" + default = 50 + description = "Warning regarding acceptable percent of 5xx errors" } ################################# @@ -69,13 +69,13 @@ variable "http_5xx_requests_threshold_warning" { ################################# variable "http_4xx_requests_threshold_critical" { - default = 30 + default = 90 description = "Maximum critical acceptable percent of 4xx errors" } variable "http_4xx_requests_threshold_warning" { - default = 15 - description = "Maximum warning acceptable percent of 4xx errors" + default = 50 + description = "Warning regarding acceptable percent of 4xx errors" } ################################# @@ -83,11 +83,11 @@ variable "http_4xx_requests_threshold_warning" { ################################# variable "http_successful_requests_threshold_critical" { - default = 90 + default = 10 description = "Minimum critical acceptable percent of 2xx & 3xx requests" } variable "http_successful_requests_threshold_warning" { - default = 95 - description = "Minimum warning acceptable percent of 2xx & 3xx requests" + default = 30 + description = "Warning regarding 
acceptable percent of 2xx & 3xx requests" } diff --git a/cloud/azure/eventhub/README.md b/cloud/azure/eventhub/README.md index b2573da..eff0d4c 100644 --- a/cloud/azure/eventhub/README.md +++ b/cloud/azure/eventhub/README.md @@ -29,10 +29,10 @@ Inputs |------|-------------|:----:|:-----:|:-----:| | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no | -| errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no | -| failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no | -| failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no | +| errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `90` | no | +| errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `50` | no | +| failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `90` | no | +| failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `50` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when an alert is triggered | string | - | yes | diff --git a/cloud/azure/eventhub/inputs.tf b/cloud/azure/eventhub/inputs.tf index 5cf007a..573d6f7 100644 --- a/cloud/azure/eventhub/inputs.tf +++ b/cloud/azure/eventhub/inputs.tf @@ -26,20 +26,20 @@ variable "filter_tags_custom" { variable "failed_requests_rate_thresold_critical" { description = "Failed requests ratio 
(percentage) to trigger the critical alert" - default = 3 + default = 90 } variable "failed_requests_rate_thresold_warning" { description = "Failed requests ratio (percentage) to trigger a warning alert" - default = 1 + default = 50 } variable "errors_rate_thresold_critical" { description = "Errors ratio (percentage) to trigger the critical alert" - default = 3 + default = 90 } variable "errors_rate_thresold_warning" { description = "Errors ratio (percentage) to trigger a warning alert" - default = 1 + default = 50 } diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf index 0736061..ae37575 100644 --- a/cloud/azure/inputs.tf +++ b/cloud/azure/inputs.tf @@ -31,175 +31,195 @@ variable "non_taggable_filter_tags" { # Azure API Management specific variables variable "apimanagement_failed_requests_threshold_critical" { description = "Maximum acceptable percent of failed requests" - default = 5 + default = 90 +} + +variable "apimanagement_failed_requests_threshold_warning" { + description = "Warning regarding acceptable percent of failed requests" + default = 50 } variable "apimanagement_other_requests_threshold_critical" { description = "Maximum acceptable percent of other requests" - default = 5 + default = 90 +} + +variable "apimanagement_other_requests_threshold_warning" { + description = "Warning regarding acceptable percent of other requests" + default = 50 } variable "apimanagement_unauthorized_requests_threshold_critical" { description = "Maximum acceptable percent of unauthorized requests" - default = 5 + default = 90 +} + +variable "apimanagement_unauthorized_requests_threshold_warning" { + description = "Warning regarding acceptable percent of unauthorized requests" + default = 50 } variable "apimanagement_successful_requests_threshold_critical" { description = "Minimum acceptable percent of successful requests" - default = 90 + default = 10 +} + +variable "apimanagement_successful_requests_threshold_warning" { + description = "Warning regarding 
acceptable percent of successful requests" + default = 30 } # Azure App Services specific variables variable "appservices_response_time_threshold_critical" { - default = 0.8 + default = 10 description = "Alerting threshold in seconds" } variable "appservices_response_time_threshold_warning" { - default = 0.4 + default = 5 description = "Warning threshold in seconds" } variable "appservices_memory_usage_threshold_critical" { - default = 52430000 + default = 1073741824 # 1 GiB, expressed in bytes description = "Alerting threshold in Mib" } variable "appservices_memory_usage_threshold_warning" { - default = 33550000 + default = 536870912 # 512 MiB, expressed in bytes description = "Warning threshold in MiB" } variable "appservices_http_4xx_requests_threshold_critical" { - default = 30 + default = 90 description = "Maximum critical acceptable percent of 4xx errors" } variable "appservices_http_4xx_requests_threshold_warning" { - default = 15 - description = "Maximum warning acceptable percent of 4xx errors" + default = 50 + description = "Warning regarding acceptable percent of 4xx errors" } variable "appservices_http_5xx_requests_threshold_critical" { - default = 20 + default = 90 description = "Maximum critical acceptable percent of 5xx errors" } variable "appservices_http_5xx_requests_threshold_warning" { - default = 10 - description = "Maximum warning acceptable percent of 5xx errors" + default = 50 + description = "Warning regarding acceptable percent of 5xx errors" } variable "appservices_http_successful_requests_threshold_critical" { - default = 90 + default = 10 description = "Minimum critical acceptable percent of 2xx & 3xx requests" } variable "appservices_http_successful_requests_threshold_warning" { - default = 95 - description = "Minimum warning acceptable percent of 2xx & 3xx requests" + default = 30 + description = "Warning regarding acceptable percent of 2xx & 3xx requests" } # Azure Event Hub specific variables variable "eventhub_failed_requests_rate_thresold_critical" { description = "Failed 
requests ratio (percentage) to trigger the critical alert" - default = 3 + default = 90 } variable "eventhub_failed_requests_rate_thresold_warning" { description = "Failed requests ratio (percentage) to trigger a warning alert" - default = 1 + default = 50 } variable "eventhub_errors_rate_thresold_critical" { description = "Errors ratio (percentage) to trigger the critical alert" - default = 3 + default = 90 } variable "eventhub_errors_rate_thresold_warning" { description = "Errors ratio (percentage) to trigger a warning alert" - default = 1 + default = 50 } # IOT Hub specific variables variable "iothub_failed_jobs_rate_threshold_warning" { description = "Jobs Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "iothub_failed_jobs_rate_threshold_critical" { description = "Jobs Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "iothub_failed_listjobs_rate_threshold_warning" { description = "ListJobs Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "iothub_failed_listjobs_rate_threshold_critical" { description = "ListJobs Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "iothub_failed_queryjobs_rate_threshold_warning" { description = "QueryJobs Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "iothub_failed_queryjobs_rate_threshold_critical" { description = "QueryJobs Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "iothub_failed_c2d_methods_rate_threshold_warning" { description = "C2D Methods Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "iothub_failed_c2d_methods_rate_threshold_critical" { description = "C2D Methods Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "iothub_failed_c2d_twin_read_rate_threshold_warning" { description = "C2D Twin Read Failed rate limit (warning threshold)" - default = 0 + default = 50 } 
variable "iothub_failed_c2d_twin_read_rate_threshold_critical" { description = "C2D Twin Read Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "iothub_failed_c2d_twin_update_rate_threshold_warning" { description = "C2D Twin Update Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "iothub_failed_c2d_twin_update_rate_threshold_critical" { description = "C2D Twin Update Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "iothub_failed_d2c_twin_read_rate_threshold_warning" { description = "D2C Twin Read Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "iothub_failed_d2c_twin_read_rate_threshold_critical" { description = "D2C Twin Read Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "iothub_failed_d2c_twin_update_rate_threshold_warning" { description = "D2C Twin Update Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "iothub_failed_d2c_twin_update_rate_threshold_critical" { description = "D2C Twin Update Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "iothub_dropped_d2c_telemetry_egress_threshold_warning" { @@ -342,62 +362,62 @@ variable "storage_latency_threshold_warning" { variable "storage_timeout_error_requests_threshold_critical" { description = "Maximum acceptable percent of timeout error requests for a storage" - default = 50 + default = 90 } variable "storage_timeout_error_requests_threshold_warning" { description = "Warning regarding acceptable percent of timeout error requests for a storage" - default = 5 + default = 50 } variable "storage_network_error_requests_threshold_critical" { description = "Maximum acceptable percent of network error requests for a storage" - default = 50 + default = 90 } variable "storage_network_error_requests_threshold_warning" { description = "Warning regarding acceptable percent of network error requests for a 
storage" - default = 5 + default = 50 } variable "storage_throttling_error_requests_threshold_critical" { description = "Maximum acceptable percent of throttling error requests for a storage" - default = 50 + default = 90 } variable "storage_throttling_error_requests_threshold_warning" { description = "Warning regarding acceptable percent of throttling error requests for a storage" - default = 10 + default = 50 } variable "storage_server_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of server other error requests for a storage" - default = 50 + default = 90 } variable "storage_server_other_error_requests_threshold_warning" { description = "Warning regarding acceptable percent of server other error requests for a storage" - default = 10 + default = 50 } variable "storage_client_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of client other error requests for a storage" - default = 50 + default = 90 } variable "storage_client_other_error_requests_threshold_warning" { description = "Warning regarding acceptable percent of client other error requests for a storage" - default = 15 + default = 50 } variable "storage_authorization_error_requests_threshold_critical" { description = "Maximum acceptable percent of authorization error requests for a storage" - default = 50 + default = 90 } variable "storage_authorization_error_requests_threshold_warning" { description = "Warning regarding acceptable percent of authorization error requests for a storage" - default = 15 + default = 50 } # Azure Stream Analytics specific variables diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index e594a65..4b29bdc 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -42,25 +42,25 @@ Inputs | dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no | | dropped_d2c_telemetry_egress_threshold_warning 
| D2C Telemetry Dropped limit (warning threshold) | string | `500` | no | | environment | Architecture Environment | string | - | yes | -| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | -| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | -| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | -| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | -| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | -| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | -| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | -| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | -| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | -| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | -| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | -| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | -| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | -| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | -| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | -| failed_queryjobs_rate_threshold_warning | 
QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `90` | no | +| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `50` | no | +| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `90` | no | +| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `50` | no | +| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `90` | no | +| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `50` | no | +| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `90` | no | +| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `50` | no | +| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `90` | no | +| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `50` | no | +| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `90` | no | +| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `50` | no | +| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `90` | no | +| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `50` | no | +| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `90` | no | +| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` 
| no | | fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | | fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | -| filter_tags | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags | Tags used for filtering | string | `*` | no | | invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | | invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | | message | Message sent when an alert is triggered | string | - | yes | diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 1eb0d0d..f9f1844 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -22,82 +22,82 @@ variable "filter_tags" { # Azure IOT hubs specific variable "failed_jobs_rate_threshold_warning" { description = "Jobs Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "failed_jobs_rate_threshold_critical" { description = "Jobs Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "failed_listjobs_rate_threshold_warning" { description = "ListJobs Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "failed_listjobs_rate_threshold_critical" { description = "ListJobs Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "failed_queryjobs_rate_threshold_warning" { description = "QueryJobs Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "failed_queryjobs_rate_threshold_critical" { description = "QueryJobs Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "failed_c2d_methods_rate_threshold_warning" { description = "C2D Methods Failed rate limit (warning threshold)" - 
default = 0 + default = 50 } variable "failed_c2d_methods_rate_threshold_critical" { description = "C2D Methods Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "failed_c2d_twin_read_rate_threshold_warning" { description = "C2D Twin Read Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "failed_c2d_twin_read_rate_threshold_critical" { description = "C2D Twin Read Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "failed_c2d_twin_update_rate_threshold_warning" { description = "C2D Twin Update Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "failed_c2d_twin_update_rate_threshold_critical" { description = "C2D Twin Update Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "failed_d2c_twin_read_rate_threshold_warning" { description = "D2C Twin Read Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "failed_d2c_twin_read_rate_threshold_critical" { description = "D2C Twin Read Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "failed_d2c_twin_update_rate_threshold_warning" { description = "D2C Twin Update Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "failed_d2c_twin_update_rate_threshold_critical" { description = "D2C Twin Update Failed rate limit (critical threshold)" - default = 10 + default = 90 } variable "dropped_d2c_telemetry_egress_threshold_warning" { diff --git a/cloud/azure/monitors.tf b/cloud/azure/monitors.tf index 650d3fd..073cab9 100644 --- a/cloud/azure/monitors.tf +++ b/cloud/azure/monitors.tf @@ -9,9 +9,13 @@ module "apimanagement" { filter_tags_custom = "${var.filter_tags_custom}" failed_requests_threshold_critical = "${var.apimanagement_failed_requests_threshold_critical}" + failed_requests_threshold_warning = "${var.apimanagement_failed_requests_threshold_warning}" other_requests_threshold_critical = 
"${var.apimanagement_other_requests_threshold_critical}" + other_requests_threshold_warning = "${var.apimanagement_other_requests_threshold_warning}" successful_requests_threshold_critical = "${var.apimanagement_successful_requests_threshold_critical}" + successful_requests_threshold_warning = "${var.apimanagement_successful_requests_threshold_warning}" unauthorized_requests_threshold_critical = "${var.apimanagement_unauthorized_requests_threshold_critical}" + unauthorized_requests_threshold_warning = "${var.apimanagement_unauthorized_requests_threshold_warning}" } module "appservices" { diff --git a/cloud/azure/storage/README.md b/cloud/azure/storage/README.md index 6933ef1..7d72473 100644 --- a/cloud/azure/storage/README.md +++ b/cloud/azure/storage/README.md @@ -32,12 +32,12 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `50` | no | -| authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `15` | no | +| authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `90` | no | +| authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `50` | no | | availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `50` | no | | availability_threshold_warning | Warning regarding acceptable percent of availability for a storage | string | `90` | no | -| client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `50` | no | -| client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client 
other error requests for a storage | string | `15` | no | +| client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `90` | no | +| client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `50` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | @@ -45,16 +45,16 @@ Inputs | latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `2000` | no | | latency_threshold_warning | Warning regarding acceptable end to end latency (ms) for a storage | string | `1000` | no | | message | Message sent when a Redis monitor is triggered | string | - | yes | -| network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `50` | no | -| network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `5` | no | -| server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `50` | no | -| server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `10` | no | +| network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `90` | no | +| network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `50` | no | +| server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `90` | no | +| 
server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `50` | no | | successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `10` | no | | successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `30` | no | -| throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `50` | no | -| throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `10` | no | -| timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `50` | no | -| timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `5` | no | +| throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `90` | no | +| throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `50` | no | +| timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `90` | no | +| timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `50` | no | Related documentation --------------------- diff --git a/cloud/azure/storage/inputs.tf b/cloud/azure/storage/inputs.tf index 76a52a2..83468fd 100644 --- a/cloud/azure/storage/inputs.tf +++ b/cloud/azure/storage/inputs.tf @@ -57,60 +57,60 @@ variable "latency_threshold_warning" { variable "timeout_error_requests_threshold_critical" { description = "Maximum acceptable percent of timeout error requests for a storage" - 
default = 50 + default = 90 } variable "timeout_error_requests_threshold_warning" { description = "Warning regarding acceptable percent of timeout error requests for a storage" - default = 5 + default = 50 } variable "network_error_requests_threshold_critical" { description = "Maximum acceptable percent of network error requests for a storage" - default = 50 + default = 90 } variable "network_error_requests_threshold_warning" { description = "Warning regarding acceptable percent of network error requests for a storage" - default = 5 + default = 50 } variable "throttling_error_requests_threshold_critical" { description = "Maximum acceptable percent of throttling error requests for a storage" - default = 50 + default = 90 } variable "throttling_error_requests_threshold_warning" { description = "Warning regarding acceptable percent of throttling error requests for a storage" - default = 10 + default = 50 } variable "server_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of server other error requests for a storage" - default = 50 + default = 90 } variable "server_other_error_requests_threshold_warning" { description = "Warning regarding acceptable percent of server other error requests for a storage" - default = 10 + default = 50 } variable "client_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of client other error requests for a storage" - default = 50 + default = 90 } variable "client_other_error_requests_threshold_warning" { description = "Warning regarding acceptable percent of client other error requests for a storage" - default = 15 + default = 50 } variable "authorization_error_requests_threshold_critical" { description = "Maximum acceptable percent of authorization error requests for a storage" - default = 50 + default = 90 } variable "authorization_error_requests_threshold_warning" { description = "Warning regarding acceptable percent of authorization error requests for a storage" - 
default = 15 + default = 50 } From e43f07d913bdd6f9c24f84101ea1360e0793f437 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Tue, 13 Feb 2018 16:49:37 +0100 Subject: [PATCH 5/6] MON-73 Best practices : rename Azure monitors and set require full window to false --- .../monitors-azure-apimanagement.tf | 20 +++---- .../app-services/monitors-app_services.tf | 24 ++++---- cloud/azure/eventhub/monitors-eventhub.tf | 12 ++-- cloud/azure/iothubs/monitors-iothubs.tf | 60 +++++++++---------- cloud/azure/redis/monitors-azure-redis.tf | 14 ++--- .../monitors-sql-database-basics.tf | 16 ++--- cloud/azure/storage/monitors-azure-storage.tf | 36 +++++------ .../monitors-stream-analytics.tf | 20 +++---- 8 files changed, 101 insertions(+), 101 deletions(-) diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf index 90da0ab..23fe090 100644 --- a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -9,7 +9,7 @@ data "template_file" "filter" { } resource "datadog_monitor" "apimgt_status" { - name = "[${var.environment}] API Management status is not ok on {{name}}" + name = "[${var.environment}] API Management is down" message = "${var.message}" query = < Date: Thu, 15 Feb 2018 17:21:03 +0100 Subject: [PATCH 6/6] MON-73 Add mute inputs for each Azure monitors --- cloud/azure/README.md | 55 +++- cloud/azure/apimanagement/README.md | 5 + cloud/azure/apimanagement/inputs.tf | 30 ++ .../monitors-azure-apimanagement.tf | 10 + cloud/azure/app-services/README.md | 15 +- cloud/azure/app-services/inputs.tf | 64 ++-- .../app-services/monitors-app_services.tf | 10 + cloud/azure/eventhub/README.md | 12 +- cloud/azure/eventhub/inputs.tf | 18 ++ cloud/azure/eventhub/monitors-eventhub.tf | 6 + cloud/azure/eventhub/outputs.tf | 11 - cloud/azure/inputs.tf | 306 +++++++++++++++++- cloud/azure/iothubs/README.md | 15 + cloud/azure/iothubs/inputs.tf | 90 
++++++ cloud/azure/iothubs/monitors-iothubs.tf | 30 ++ cloud/azure/monitors.tf | 51 ++- cloud/azure/redis/README.md | 3 + cloud/azure/redis/inputs.tf | 24 ++ cloud/azure/redis/monitors-azure-redis.tf | 8 + cloud/azure/sql-database/README.md | 4 + cloud/azure/sql-database/inputs.tf | 23 ++ .../monitors-sql-database-basics.tf | 8 + cloud/azure/storage/README.md | 9 + cloud/azure/storage/inputs.tf | 54 ++++ cloud/azure/storage/monitors-azure-storage.tf | 18 ++ cloud/azure/stream-analytics/README.md | 7 +- cloud/azure/stream-analytics/inputs.tf | 32 +- .../monitors-stream-analytics.tf | 12 +- 28 files changed, 868 insertions(+), 62 deletions(-) delete mode 100644 cloud/azure/eventhub/outputs.tf diff --git a/cloud/azure/README.md b/cloud/azure/README.md index e462bd5..3e65051 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -31,95 +31,144 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| +| apimanagement_failed_requests_silenced | Groups to mute for API Management failed requests monitor | map | `` | no | | apimanagement_failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `90` | no | | apimanagement_failed_requests_threshold_warning | Warning regarding acceptable percent of failed requests | string | `50` | no | +| apimanagement_other_requests_silenced | Groups to mute for API Management other requests monitor | map | `` | no | | apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `90` | no | | apimanagement_other_requests_threshold_warning | Warning regarding acceptable percent of other requests | string | `50` | no | +| apimanagement_status_silenced | Groups to mute for API Management status monitor | map | `` | no | +| apimanagement_successful_requests_silenced | Groups to mute for API Management successful requests monitor | map | `` | no | | apimanagement_successful_requests_threshold_critical | 
Minimum acceptable percent of successful requests | string | `10` | no | | apimanagement_successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests | string | `30` | no | +| apimanagement_unauthorized_requests_silenced | Groups to mute for API Management unauthorized requests monitor | map | `` | no | | apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `90` | no | | apimanagement_unauthorized_requests_threshold_warning | Warning regarding acceptable percent of unauthorized requests | string | `50` | no | +| appservices_http_4xx_requests_silenced | Groups to mute for App Services 4xx requests monitor | map | `` | no | | appservices_http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `90` | no | | appservices_http_4xx_requests_threshold_warning | Warning regarding acceptable percent of 4xx errors | string | `50` | no | +| appservices_http_5xx_requests_silenced | Groups to mute for App Services 5xx requests monitor | map | `` | no | | appservices_http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `90` | no | | appservices_http_5xx_requests_threshold_warning | Warning regarding acceptable percent of 5xx errors | string | `50` | no | +| appservices_http_successful_requests_silenced | Groups to mute for App Services successful requests monitor | map | `` | no | | appservices_http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `10` | no | | appservices_http_successful_requests_threshold_warning | Warning regarding acceptable percent of 2xx & 3xx requests | string | `30` | no | +| appservices_memory_usage_silenced | Groups to mute for App Services memory usage monitor | map | `` | no | | appservices_memory_usage_threshold_critical | Alerting threshold in Mib | string | `1073741824` | no | | 
appservices_memory_usage_threshold_warning | Warning threshold in MiB | string | `536870912` | no | -| appservices_response_time_threshold_critical | Alerting threshold in seconds | string | `10` | no | -| appservices_response_time_threshold_warning | Warning threshold in seconds | string | `5` | no | +| appservices_response_time_silenced | Groups to mute for App Services response time monitor | map | `` | no | +| appservices_response_time_threshold_critical | Alerting threshold for response time in seconds | string | `10` | no | +| appservices_response_time_threshold_warning | Warning threshold for response time in seconds | string | `5` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | +| eventhub_errors_rate_silenced | Groups to mute for Event Hub errors monitor | map | `` | no | | eventhub_errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `90` | no | | eventhub_errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `50` | no | +| eventhub_failed_requests_rate_silenced | Groups to mute for Event Hub failed requests monitor | map | `` | no | | eventhub_failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `90` | no | | eventhub_failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `50` | no | +| eventhub_status_silenced | Groups to mute for Event Hub status monitor | map | `` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| iothub_dropped_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub dropped d2c telemetry monitor | map | `` | no | | iothub_dropped_d2c_telemetry_egress_threshold_critical 
| D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no | | iothub_dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no | +| iothub_failed_c2d_methods_rate_silenced | Groups to mute for IoT Hub failed c2d methods monitor | map | `` | no | | iothub_failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `90` | no | | iothub_failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_c2d_twin_read_rate_silenced | Groups to mute for IoT Hub failed c2d twin read monitor | map | `` | no | | iothub_failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `90` | no | | iothub_failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_c2d_twin_update_rate_silenced | Groups to mute for IoT Hub failed c2d twin update monitor | map | `` | no | | iothub_failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `90` | no | | iothub_failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_d2c_twin_read_rate_silenced | Groups to mute for IoT Hub failed d2c twin read monitor | map | `` | no | | iothub_failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `90` | no | | iothub_failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_d2c_twin_update_rate_silenced | Groups to mute for IoT Hub failed d2c twin update monitor | map | `` | no | | iothub_failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `90` | no | 
| iothub_failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_jobs_rate_silenced | Groups to mute for IoT Hub failed jobs monitor | map | `` | no | | iothub_failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `90` | no | | iothub_failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_listjobs_rate_silenced | Groups to mute for IoT Hub failed list jobs monitor | map | `` | no | | iothub_failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `90` | no | | iothub_failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_queryjobs_rate_silenced | Groups to mute for IoT Hub failed query jobs monitor | map | `` | no | | iothub_failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `90` | no | | iothub_failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` | no | +| iothub_fallback_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub fallback d2c telemetry monitor | map | `` | no | | iothub_fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | | iothub_fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | +| iothub_invalid_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub invalid d2c telemetry monitor | map | `` | no | | iothub_invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | | iothub_invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | +| 
iothub_orphaned_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub orphaned d2c telemetry monitor | map | `` | no | | iothub_orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no | | iothub_orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no | +| iothub_status_silenced | Groups to mute for IoT Hub status monitor | map | `` | no | +| iothub_too_many_d2c_telemetry_ingress_nosent_silenced | Groups to mute for IoT Hub unsent d2c telemetry monitor | map | `` | no | +| iothub_total_devices_silenced | Groups to mute for IoT Hub total device monitor | map | `` | no | | message | Message sent when a monitor is triggered | string | - | yes | | non_taggable_filter_tags | Tags used for filtering for components without tag support | string | `*` | no | +| redis_evictedkeys_limit_silenced | Groups to mute for Redis evicted keys monitor | map | `` | no | | redis_evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no | | redis_evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no | +| redis_percent_processor_time_silenced | Groups to mute for Redis processor monitor | map | `` | no | | redis_percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no | | redis_percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no | +| redis_server_load_rate_silenced | Groups to mute for Redis server load monitor | map | `` | no | | redis_server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | | redis_server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no | +| sqldatabase_cpu_silenced | Groups to mute for SQL CPU monitor | map | `` | no | | sqldatabase_cpu_threshold_critical | 
CPU usage in percent (critical threshold) | string | `90` | no | | sqldatabase_cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no | +| sqldatabase_deadlock_silenced | Groups to mute for SQL Deadlock monitor | map | `` | no | | sqldatabase_deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no | +| sqldatabase_diskspace_silenced | Groups to mute for SQL disk space monitor | map | `` | no | | sqldatabase_diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no | | sqldatabase_diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no | +| sqldatabase_dtu_silenced | Groups to mute for SQL DTU monitor | map | `` | no | | sqldatabase_dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no | | sqldatabase_dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no | +| storage_authorization_error_requests_silenced | Groups to mute for Storage authorization errors monitor | map | `` | no | | storage_authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `90` | no | | storage_authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `50` | no | +| storage_availability_silenced | Groups to mute for Storage availability monitor | map | `` | no | | storage_availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `50` | no | | storage_availability_threshold_warning | Warning regarding acceptable percent of availability for a storage | string | `90` | no | +| storage_client_other_error_requests_silenced | Groups to mute for Storage other errors monitor | map | `` | no | | storage_client_other_error_requests_threshold_critical | Maximum acceptable percent of client other 
error requests for a storage | string | `90` | no | | storage_client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `50` | no | +| storage_latency_silenced | Groups to mute for Storage latency monitor | map | `` | no | | storage_latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `2000` | no | | storage_latency_threshold_warning | Warning regarding acceptable end to end latency (ms) for a storage | string | `1000` | no | +| storage_network_error_requests_silenced | Groups to mute for Storage network errors monitor | map | `` | no | | storage_network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `90` | no | | storage_network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `50` | no | +| storage_server_other_error_requests_silenced | Groups to mute for Storage server other errors monitor | map | `` | no | | storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `90` | no | | storage_server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `50` | no | +| storage_successful_requests_silenced | Groups to mute for Storage successful requests monitor | map | `` | no | | storage_successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `10` | no | | storage_successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `30` | no | +| storage_throttling_error_requests_silenced | Groups to mute for Storage throttling error monitor | map | `` | no | | storage_throttling_error_requests_threshold_critical | Maximum acceptable
percent of throttling error requests for a storage | string | `90` | no | | storage_throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `50` | no | +| storage_timeout_error_requests_silenced | Groups to mute for Storage timeout monitor | map | `` | no | | storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `90` | no | | storage_timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `50` | no | +| streamanalytics_conversion_errors_silenced | Groups to mute for Stream Analytics conversion errors monitor | map | `` | no | | streamanalytics_conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no | | streamanalytics_conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no | +| streamanalytics_failed_function_requests_silenced | Groups to mute for Stream Analytics failed requests monitor | map | `` | no | | streamanalytics_failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | -| streamanalytics_function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | +| streamanalytics_failed_function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | +| streamanalytics_runtime_errors_silenced | Groups to mute for Stream Analytics runtime errors monitor | map | `` | no | | streamanalytics_runtime_errors_threshold_critical | Runtime errors limit (critical threshold) | string | `10` | no | | streamanalytics_runtime_errors_threshold_warning | Runtime errors limit (warning threshold) | string | `0` | no | +| streamanalytics_status_silenced | Groups to mute for Stream Analytics status 
monitor | map | `` | no | +| streamanalytics_su_utilization_silenced | Groups to mute for Stream Analytics utilization monitor | map | `` | no | | streamanalytics_su_utilization_threshold_critical | Streaming Unit utilization rate limit (critical threshold) | string | `80` | no | | streamanalytics_su_utilization_threshold_warning | Streaming Unit utilization rate limit (warning threshold) | string | `60` | no | diff --git a/cloud/azure/apimanagement/README.md b/cloud/azure/apimanagement/README.md index 3c94724..82de287 100644 --- a/cloud/azure/apimanagement/README.md +++ b/cloud/azure/apimanagement/README.md @@ -29,15 +29,20 @@ Inputs |------|-------------|:----:|:-----:|:-----:| | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | +| failed_requests_silenced | Groups to mute for API Management failed requests monitor | map | `` | no | | failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `90` | no | | failed_requests_threshold_warning | Warning regarding acceptable percent of failed requests | string | `50` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when a Redis monitor is triggered | string | - | yes | +| other_requests_silenced | Groups to mute for API Management other requests monitor | map | `` | no | | other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `90` | no | | other_requests_threshold_warning | Warning regarding acceptable percent of other requests | string | `50` | no | +| status_silenced | Groups to mute for API Management status monitor | map | `` | no | +| successful_requests_silenced | Groups to mute for API Management successful requests monitor | map | `` | no | | 
successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `10` | no | | successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests | string | `30` | no | +| unauthorized_requests_silenced | Groups to mute for API Management unauthorized requests monitor | map | `` | no | | unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `90` | no | | unauthorized_requests_threshold_warning | Warning regarding acceptable percent of unauthorized requests | string | `50` | no | diff --git a/cloud/azure/apimanagement/inputs.tf b/cloud/azure/apimanagement/inputs.tf index 45e85fe..74273b2 100644 --- a/cloud/azure/apimanagement/inputs.tf +++ b/cloud/azure/apimanagement/inputs.tf @@ -25,6 +25,18 @@ variable "filter_tags_custom" { } # Azure API Management specific +variable "status_silenced" { + description = "Groups to mute for API Management status monitor" + type = "map" + default = {} +} + +variable "failed_requests_silenced" { + description = "Groups to mute for API Management failed requests monitor" + type = "map" + default = {} +} + variable "failed_requests_threshold_critical" { description = "Maximum acceptable percent of failed requests" default = 90 @@ -35,6 +47,12 @@ variable "failed_requests_threshold_warning" { default = 50 } +variable "other_requests_silenced" { + description = "Groups to mute for API Management other requests monitor" + type = "map" + default = {} +} + variable "other_requests_threshold_critical" { description = "Maximum acceptable percent of other requests" default = 90 @@ -45,6 +63,12 @@ variable "other_requests_threshold_warning" { default = 50 } +variable "unauthorized_requests_silenced" { + description = "Groups to mute for API Management unauthorized requests monitor" + type = "map" + default = {} +} + variable "unauthorized_requests_threshold_critical" { description = "Maximum acceptable percent of 
unauthorized requests" default = 90 @@ -55,6 +79,12 @@ variable "unauthorized_requests_threshold_warning" { default = 50 } +variable "successful_requests_silenced" { + description = "Groups to mute for API Management successful requests monitor" + type = "map" + default = {} +} + variable "successful_requests_threshold_critical" { description = "Minimum acceptable percent of successful requests" default = 10 diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf index 23fe090..5a17080 100644 --- a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -22,6 +22,8 @@ resource "datadog_monitor" "apimgt_status" { critical = 1 } + silenced = "${var.status_silenced}" + notify_no_data = true evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -52,6 +54,8 @@ resource "datadog_monitor" "apimgt_failed_requests" { warning = "${var.failed_requests_threshold_warning}" } + silenced = "${var.failed_requests_silenced}" + type = "metric alert" notify_no_data = false notify_audit = false @@ -83,6 +87,8 @@ resource "datadog_monitor" "apimgt_other_requests" { warning = "${var.other_requests_threshold_warning}" } + silenced = "${var.other_requests_silenced}" + type = "metric alert" notify_no_data = false notify_audit = false @@ -114,6 +120,8 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" { warning = "${var.unauthorized_requests_threshold_warning}" } + silenced = "${var.unauthorized_requests_silenced}" + type = "metric alert" notify_no_data = false notify_audit = false @@ -145,6 +153,8 @@ resource "datadog_monitor" "apimgt_successful_requests" { warning = "${var.successful_requests_threshold_warning}" } + silenced = "${var.successful_requests_silenced}" + type = "metric alert" notify_no_data = false notify_audit = false diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index 
e4d12c1..1304dbc 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -32,17 +32,22 @@ Inputs | environment | Architecture environment | string | - | yes | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| http_4xx_requests_silenced | Groups to mute for App Services 4xx requests monitor | map | `` | no | | http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `90` | no | -| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `50` | no | +| http_4xx_requests_threshold_warning | Warning regarding acceptable percent of 4xx errors | string | `50` | no | +| http_5xx_requests_silenced | Groups to mute for App Services 5xx requests monitor | map | `` | no | | http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `90` | no | -| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `50` | no | +| http_5xx_requests_threshold_warning | Warning regarding acceptable percent of 5xx errors | string | `50` | no | +| http_successful_requests_silenced | Groups to mute for App Services successful requests monitor | map | `` | no | | http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `10` | no | -| http_successful_requests_threshold_warning | Minimum warning acceptable percent of 2xx & 3xx requests | string | `30` | no | +| http_successful_requests_threshold_warning | Warning regarding acceptable percent of 2xx & 3xx requests | string | `30` | no | +| memory_usage_silenced | Groups to mute for App Services memory usage monitor | map | `` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `1073741824` | no | | 
memory_usage_threshold_warning | Warning threshold in MiB | string | `536870912` | no | | message | Message sent when a monitor is triggered | string | - | yes | -| response_time_threshold_critical | Alerting threshold in seconds | string | `10` | no | -| response_time_threshold_warning | Warning threshold in seconds | string | `5` | no | +| response_time_silenced | Groups to mute for App Services response time monitor | map | `` | no | +| response_time_threshold_critical | Alerting threshold for response time in seconds | string | `10` | no | +| response_time_threshold_warning | Warning threshold for response time in seconds | string | `5` | no | Related documentation --------------------- diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index c2de792..8ed4216 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -22,23 +22,27 @@ variable "delay" { default = 600 } -################################### -### RESPONSE TIME VARIABLES ### -################################### +variable "response_time_silenced" { + description = "Groups to mute for App Services response time monitor" + type = "map" + default = {} +} variable "response_time_threshold_critical" { default = 10 - description = "Alerting threshold in seconds" + description = "Alerting threshold for response time in seconds" } variable "response_time_threshold_warning" { default = 5 - description = "Warning threshold in seconds" + description = "Warning threshold for response time in seconds" } -################################### -### MEMORY USAGE VARIABLES ### -################################### +variable "memory_usage_silenced" { + description = "Groups to mute for App Services memory usage monitor" + type = "map" + default = {} +} variable "memory_usage_threshold_critical" { default = 1073741824 # 1Gb @@ -50,24 +54,12 @@ variable "memory_usage_threshold_warning" { description = "Warning threshold in MiB" } 
-################################# -### HTTP 5xx status pages ### -################################# - -variable "http_5xx_requests_threshold_critical" { - default = 90 - description = "Maximum critical acceptable percent of 5xx errors" +variable "http_4xx_requests_silenced" { + description = "Groups to mute for App Services 4xx requests monitor" + type = "map" + default = {} } -variable "http_5xx_requests_threshold_warning" { - default = 50 - description = "Warning regarding acceptable percent of 5xx errors" -} - -################################# -### HTTP 4xx status pages ### -################################# - variable "http_4xx_requests_threshold_critical" { default = 90 description = "Maximum critical acceptable percent of 4xx errors" @@ -78,9 +70,27 @@ variable "http_4xx_requests_threshold_warning" { description = "Warning regarding acceptable percent of 4xx errors" } -################################# -### HTTP 2xx status pages ### -################################# +variable "http_5xx_requests_silenced" { + description = "Groups to mute for App Services 5xx requests monitor" + type = "map" + default = {} +} + +variable "http_5xx_requests_threshold_critical" { + default = 90 + description = "Maximum critical acceptable percent of 5xx errors" +} + +variable "http_5xx_requests_threshold_warning" { + default = 50 + description = "Warning regarding acceptable percent of 5xx errors" +} + +variable "http_successful_requests_silenced" { + description = "Groups to mute for App Services successful requests monitor" + type = "map" + default = {} +} variable "http_successful_requests_threshold_critical" { default = 10 diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index a5a7824..62a68a8 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -26,6 +26,8 @@ resource "datadog_monitor" "appservices_response_time" { critical = 
"${var.response_time_threshold_critical}" } + silenced = "${var.response_time_silenced}" + notify_no_data = true # Will notify when no data is received renotify_interval = 0 require_full_window = false @@ -55,6 +57,8 @@ resource "datadog_monitor" "appservices_memory_usage_count" { critical = "${var.memory_usage_threshold_critical}" } + silenced = "${var.memory_usage_silenced}" + notify_no_data = true # Will notify when no data is received renotify_interval = 0 require_full_window = false @@ -85,6 +89,8 @@ resource "datadog_monitor" "appservices_http_5xx_errors_count" { critical = "${var.http_5xx_requests_threshold_critical}" } + silenced = "${var.http_5xx_requests_silenced}" + notify_no_data = false # Will NOT notify when no data is received renotify_interval = 0 require_full_window = false @@ -115,6 +121,8 @@ resource "datadog_monitor" "appservices_http_4xx_errors_count" { critical = "${var.http_4xx_requests_threshold_critical}" } + silenced = "${var.http_4xx_requests_silenced}" + notify_no_data = false # Will NOT notify when no data is received renotify_interval = 0 require_full_window = false @@ -146,6 +154,8 @@ resource "datadog_monitor" "appservices_http_success_status_rate" { critical = "${var.http_successful_requests_threshold_critical}" } + silenced = "${var.http_successful_requests_silenced}" + notify_no_data = false # Will notify when no data is received renotify_interval = 0 require_full_window = false diff --git a/cloud/azure/eventhub/README.md b/cloud/azure/eventhub/README.md index eff0d4c..7bfc5f6 100644 --- a/cloud/azure/eventhub/README.md +++ b/cloud/azure/eventhub/README.md @@ -29,22 +29,16 @@ Inputs |------|-------------|:----:|:-----:|:-----:| | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | +| errors_rate_silenced | Groups to mute for Event Hub errors monitor | map | `` | no | | errors_rate_thresold_critical | Errors ratio (percentage) to trigger the 
critical alert | string | `90` | no | | errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `50` | no | +| failed_requests_rate_silenced | Groups to mute for Event Hub failed requests monitor | map | `` | no | | failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `90` | no | | failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `50` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when an alert is triggered | string | - | yes | - -Outputs -------- - -| Name | Description | -|------|-------------| -| errors_monitor_id | Id of the `errors` monitor | -| failed_requests_monitor_id | Id of the `failed requests` monitor | -| status_monitor_id | Id of the `status` monitor | +| status_silenced | Groups to mute for Event Hub status monitor | map | `` | no | Related documentation --------------------- diff --git a/cloud/azure/eventhub/inputs.tf b/cloud/azure/eventhub/inputs.tf index 573d6f7..3d60a29 100644 --- a/cloud/azure/eventhub/inputs.tf +++ b/cloud/azure/eventhub/inputs.tf @@ -24,6 +24,18 @@ variable "filter_tags_custom" { default = "*" } +variable "status_silenced" { + description = "Groups to mute for Event Hub status monitor" + type = "map" + default = {} +} + +variable "failed_requests_rate_silenced" { + description = "Groups to mute for Event Hub failed requests monitor" + type = "map" + default = {} +} + variable "failed_requests_rate_thresold_critical" { description = "Failed requests ratio (percentage) to trigger the critical alert" default = 90 @@ -34,6 +46,12 @@ variable "failed_requests_rate_thresold_warning" { default = 50 } +variable "errors_rate_silenced" { + description = "Groups to mute for Event Hub 
errors monitor" + type = "map" + default = {} +} + variable "errors_rate_thresold_critical" { description = "Errors ratio (percentage) to trigger the critical alert" default = 90 diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index 72b695f..0d93b95 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -16,6 +16,8 @@ resource "datadog_monitor" "eventhub_status" { type = "metric alert" + silenced = "${var.status_silenced}" + notify_no_data = true evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -50,6 +52,8 @@ resource "datadog_monitor" "eventhub_failed_requests" { warning = "${var.failed_requests_rate_thresold_warning}" } + silenced = "${var.failed_requests_rate_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -88,6 +92,8 @@ resource "datadog_monitor" "eventhub_errors" { warning = "${var.errors_rate_thresold_warning}" } + silenced = "${var.errors_rate_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 diff --git a/cloud/azure/eventhub/outputs.tf b/cloud/azure/eventhub/outputs.tf deleted file mode 100644 index b9d1822..0000000 --- a/cloud/azure/eventhub/outputs.tf +++ /dev/null @@ -1,11 +0,0 @@ -output "status_monitor_id" { - value = "${datadog_monitor.eventhub_failed_requests.id}" -} - -output "failed_requests_monitor_id" { - value = "${datadog_monitor.eventhub_status.id}" -} - -output "errors_monitor_id" { - value = "${datadog_monitor.eventhub_errors.id}" -} diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf index ae37575..c564080 100644 --- a/cloud/azure/inputs.tf +++ b/cloud/azure/inputs.tf @@ -29,6 +29,18 @@ variable "non_taggable_filter_tags" { } # Azure API Management specific variables +variable "apimanagement_status_silenced" { + description = "Groups to mute for API Management status monitor" + type = "map" + default = {} +} + +variable 
"apimanagement_failed_requests_silenced" { + description = "Groups to mute for API Management failed requests monitor" + type = "map" + default = {} +} + variable "apimanagement_failed_requests_threshold_critical" { description = "Maximum acceptable percent of failed requests" default = 90 @@ -39,6 +51,12 @@ variable "apimanagement_failed_requests_threshold_warning" { default = 50 } +variable "apimanagement_other_requests_silenced" { + description = "Groups to mute for API Management other requests monitor" + type = "map" + default = {} +} + variable "apimanagement_other_requests_threshold_critical" { description = "Maximum acceptable percent of other requests" default = 90 @@ -49,6 +67,12 @@ variable "apimanagement_other_requests_threshold_warning" { default = 50 } +variable "apimanagement_unauthorized_requests_silenced" { + description = "Groups to mute for API Management unauthorized requests monitor" + type = "map" + default = {} +} + variable "apimanagement_unauthorized_requests_threshold_critical" { description = "Maximum acceptable percent of unauthorized requests" default = 90 @@ -59,6 +83,12 @@ variable "apimanagement_unauthorized_requests_threshold_warning" { default = 50 } +variable "apimanagement_successful_requests_silenced" { + description = "Groups to mute for API Management successful requests monitor" + type = "map" + default = {} +} + variable "apimanagement_successful_requests_threshold_critical" { description = "Minimum acceptable percent of successful requests" default = 10 @@ -70,14 +100,26 @@ variable "apimanagement_successful_requests_threshold_warning" { } # Azure App Services specific variables +variable "appservices_response_time_silenced" { + description = "Groups to mute for App Services response time monitor" + type = "map" + default = {} +} + variable "appservices_response_time_threshold_critical" { default = 10 - description = "Alerting threshold in seconds" + description = "Alerting threshold for response time in seconds" } variable 
"appservices_response_time_threshold_warning" { default = 5 - description = "Warning threshold in seconds" + description = "Warning threshold for response time in seconds" +} + +variable "appservices_memory_usage_silenced" { + description = "Groups to mute for App Services memory usage monitor" + type = "map" + default = {} } variable "appservices_memory_usage_threshold_critical" { @@ -90,6 +132,12 @@ variable "appservices_memory_usage_threshold_warning" { description = "Warning threshold in MiB" } +variable "appservices_http_4xx_requests_silenced" { + description = "Groups to mute for App Services 4xx requests monitor" + type = "map" + default = {} +} + variable "appservices_http_4xx_requests_threshold_critical" { default = 90 description = "Maximum critical acceptable percent of 4xx errors" @@ -100,6 +148,12 @@ variable "appservices_http_4xx_requests_threshold_warning" { description = "Warning regarding acceptable percent of 4xx errors" } +variable "appservices_http_5xx_requests_silenced" { + description = "Groups to mute for App Services 5xx requests monitor" + type = "map" + default = {} +} + variable "appservices_http_5xx_requests_threshold_critical" { default = 90 description = "Maximum critical acceptable percent of 5xx errors" @@ -110,6 +164,12 @@ variable "appservices_http_5xx_requests_threshold_warning" { description = "Warning regarding acceptable percent of 5xx errors" } +variable "appservices_http_successful_requests_silenced" { + description = "Groups to mute for App Services successful requests monitor" + type = "map" + default = {} +} + variable "appservices_http_successful_requests_threshold_critical" { default = 10 description = "Minimum critical acceptable percent of 2xx & 3xx requests" @@ -121,6 +181,18 @@ variable "appservices_http_successful_requests_threshold_warning" { } # Azure Event Hub specific variables +variable "eventhub_status_silenced" { + description = "Groups to mute for Event Hub status monitor" + type = "map" + default = {} +} + 
+variable "eventhub_failed_requests_rate_silenced" { + description = "Groups to mute for Event Hub failed requests monitor" + type = "map" + default = {} +} + variable "eventhub_failed_requests_rate_thresold_critical" { description = "Failed requests ratio (percentage) to trigger the critical alert" default = 90 @@ -131,6 +203,12 @@ variable "eventhub_failed_requests_rate_thresold_warning" { default = 50 } +variable "eventhub_errors_rate_silenced" { + description = "Groups to mute for Event Hub errors monitor" + type = "map" + default = {} +} + variable "eventhub_errors_rate_thresold_critical" { description = "Errors ratio (percentage) to trigger the critical alert" default = 90 @@ -142,6 +220,30 @@ variable "eventhub_errors_rate_thresold_warning" { } # IOT Hub specific variables +variable "iothub_status_silenced" { + description = "Groups to mute for IoT Hub status monitor" + type = "map" + default = {} +} + +variable "iothub_total_devices_silenced" { + description = "Groups to mute for IoT Hub total device monitor" + type = "map" + default = {} +} + +variable "iothub_too_many_d2c_telemetry_ingress_nosent_silenced" { + description = "Groups to mute for IoT Hub unsent d2c telemetry monitor" + type = "map" + default = {} +} + +variable "iothub_failed_jobs_rate_silenced" { + description = "Groups to mute for IoT Hub failed jobs monitor" + type = "map" + default = {} +} + variable "iothub_failed_jobs_rate_threshold_warning" { description = "Jobs Failed rate limit (warning threshold)" default = 50 @@ -152,6 +254,12 @@ variable "iothub_failed_jobs_rate_threshold_critical" { default = 90 } +variable "iothub_failed_listjobs_rate_silenced" { + description = "Groups to mute for IoT Hub failed list jobs monitor" + type = "map" + default = {} +} + variable "iothub_failed_listjobs_rate_threshold_warning" { description = "ListJobs Failed rate limit (warning threshold)" default = 50 @@ -162,6 +270,12 @@ variable "iothub_failed_listjobs_rate_threshold_critical" { default = 90 } 
+variable "iothub_failed_queryjobs_rate_silenced" { + description = "Groups to mute for IoT Hub failed query jobs monitor" + type = "map" + default = {} +} + variable "iothub_failed_queryjobs_rate_threshold_warning" { description = "QueryJobs Failed rate limit (warning threshold)" default = 50 @@ -172,6 +286,12 @@ variable "iothub_failed_queryjobs_rate_threshold_critical" { default = 90 } +variable "iothub_failed_c2d_methods_rate_silenced" { + description = "Groups to mute for IoT Hub failed c2d methods monitor" + type = "map" + default = {} +} + variable "iothub_failed_c2d_methods_rate_threshold_warning" { description = "C2D Methods Failed rate limit (warning threshold)" default = 50 @@ -182,6 +302,12 @@ variable "iothub_failed_c2d_methods_rate_threshold_critical" { default = 90 } +variable "iothub_failed_c2d_twin_read_rate_silenced" { + description = "Groups to mute for IoT Hub failed c2d twin read monitor" + type = "map" + default = {} +} + variable "iothub_failed_c2d_twin_read_rate_threshold_warning" { description = "C2D Twin Read Failed rate limit (warning threshold)" default = 50 @@ -192,6 +318,12 @@ variable "iothub_failed_c2d_twin_read_rate_threshold_critical" { default = 90 } +variable "iothub_failed_c2d_twin_update_rate_silenced" { + description = "Groups to mute for IoT Hub failed c2d twin update monitor" + type = "map" + default = {} +} + variable "iothub_failed_c2d_twin_update_rate_threshold_warning" { description = "C2D Twin Update Failed rate limit (warning threshold)" default = 50 @@ -202,6 +334,12 @@ variable "iothub_failed_c2d_twin_update_rate_threshold_critical" { default = 90 } +variable "iothub_failed_d2c_twin_read_rate_silenced" { + description = "Groups to mute for IoT Hub failed d2c twin read monitor" + type = "map" + default = {} +} + variable "iothub_failed_d2c_twin_read_rate_threshold_warning" { description = "D2C Twin Read Failed rate limit (warning threshold)" default = 50 @@ -212,6 +350,12 @@ variable 
"iothub_failed_d2c_twin_read_rate_threshold_critical" { default = 90 } +variable "iothub_failed_d2c_twin_update_rate_silenced" { + description = "Groups to mute for IoT Hub failed d2c twin update monitor" + type = "map" + default = {} +} + variable "iothub_failed_d2c_twin_update_rate_threshold_warning" { description = "D2C Twin Update Failed rate limit (warning threshold)" default = 50 @@ -222,6 +366,12 @@ variable "iothub_failed_d2c_twin_update_rate_threshold_critical" { default = 90 } +variable "iothub_dropped_d2c_telemetry_egress_silenced" { + description = "Groups to mute for IoT Hub dropped d2c telemetry monitor" + type = "map" + default = {} +} + variable "iothub_dropped_d2c_telemetry_egress_threshold_warning" { description = "D2C Telemetry Dropped limit (warning threshold)" default = 500 @@ -232,6 +382,12 @@ variable "iothub_dropped_d2c_telemetry_egress_threshold_critical" { default = 1000 } +variable "iothub_orphaned_d2c_telemetry_egress_silenced" { + description = "Groups to mute for IoT Hub orphaned d2c telemetry monitor" + type = "map" + default = {} +} + variable "iothub_orphaned_d2c_telemetry_egress_threshold_warning" { description = "D2C Telemetry Orphaned limit (warning threshold)" default = 500 @@ -242,6 +398,12 @@ variable "iothub_orphaned_d2c_telemetry_egress_threshold_critical" { default = 1000 } +variable "iothub_invalid_d2c_telemetry_egress_silenced" { + description = "Groups to mute for IoT Hub invalid d2c telemetry monitor" + type = "map" + default = {} +} + variable "iothub_invalid_d2c_telemetry_egress_threshold_warning" { description = "D2C Telemetry Invalid limit (warning threshold)" default = 500 @@ -252,6 +414,12 @@ variable "iothub_invalid_d2c_telemetry_egress_threshold_critical" { default = 1000 } +variable "iothub_fallback_d2c_telemetry_egress_silenced" { + description = "Groups to mute for IoT Hub fallback d2c telemetry monitor" + type = "map" + default = {} +} + variable "iothub_fallback_d2c_telemetry_egress_threshold_warning" { 
description = "D2C Telemetry Fallback limit (warning threshold)" default = 500 @@ -263,6 +431,18 @@ variable "iothub_fallback_d2c_telemetry_egress_threshold_critical" { } # Azure Redis specific variables +variable "redis_status_silenced" { + description = "Groups to mute for Redis status monitor" + type = "map" + default = {} +} + +variable "redis_evictedkeys_limit_silenced" { + description = "Groups to mute for Redis evicted keys monitor" + type = "map" + default = {} +} + variable "redis_evictedkeys_limit_threshold_warning" { description = "Evicted keys limit (warning threshold)" default = 0 @@ -273,6 +453,12 @@ variable "redis_evictedkeys_limit_threshold_critical" { default = 100 } +variable "redis_percent_processor_time_silenced" { + description = "Groups to mute for Redis processor monitor" + type = "map" + default = {} +} + variable "redis_percent_processor_time_threshold_critical" { description = "Processor time percent (critical threshold)" default = 80 @@ -283,6 +469,12 @@ variable "redis_percent_processor_time_threshold_warning" { default = 60 } +variable "redis_server_load_rate_silenced" { + description = "Groups to mute for Redis server load monitor" + type = "map" + default = {} +} + variable "redis_server_load_rate_threshold_critical" { description = "Server CPU load rate (critical threshold)" default = 90 @@ -294,6 +486,12 @@ variable "redis_server_load_rate_threshold_warning" { } # Azure SQL Database specific variables +variable "sqldatabase_cpu_silenced" { + description = "Groups to mute for SQL CPU monitor" + type = "map" + default = {} +} + variable "sqldatabase_cpu_threshold_warning" { description = "CPU usage in percent (warning threshold)" default = "80" @@ -304,6 +502,12 @@ variable "sqldatabase_cpu_threshold_critical" { default = "90" } +variable "sqldatabase_diskspace_silenced" { + description = "Groups to mute for SQL disk space monitor" + type = "map" + default = {} +} + variable "sqldatabase_diskspace_threshold_warning" { description = 
"Disk space used in percent (warning threshold)" default = "80" @@ -314,6 +518,12 @@ variable "sqldatabase_diskspace_threshold_critical" { default = "90" } +variable "sqldatabase_dtu_silenced" { + description = "Groups to mute for SQL DTU monitor" + type = "map" + default = {} +} + variable "sqldatabase_dtu_threshold_warning" { description = "Amount of DTU used (warning threshold)" default = "85" @@ -324,12 +534,24 @@ variable "sqldatabase_dtu_threshold_critical" { default = "90" } +variable "sqldatabase_deadlock_silenced" { + description = "Groups to mute for SQL Deadlock monitor" + type = "map" + default = {} +} + variable "sqldatabase_deadlock_threshold_critical" { description = "Amount of Deadlocks (critical threshold)" default = "1" } # Azure Storage specific variables +variable "storage_availability_silenced" { + description = "Groups to mute for Storage availability monitor" + type = "map" + default = {} +} + variable "storage_availability_threshold_critical" { description = "Minimum acceptable percent of availability for a storage" default = 50 @@ -340,6 +562,12 @@ variable "storage_availability_threshold_warning" { default = 90 } +variable "storage_successful_requests_silenced" { + description = "Groups to mute for Storage successful requests monitor" + type = "map" + default = {} +} + variable "storage_successful_requests_threshold_critical" { description = "Minimum acceptable percent of successful requests for a storage" default = 10 @@ -350,6 +578,12 @@ variable "storage_successful_requests_threshold_warning" { default = 30 } +variable "storage_latency_silenced" { + description = "Groups to mute for Storage latency monitor" + type = "map" + default = {} +} + variable "storage_latency_threshold_critical" { description = "Maximum acceptable end to end latency (ms) for a storage" default = 2000 @@ -360,6 +594,12 @@ variable "storage_latency_threshold_warning" { default = 1000 } +variable "storage_timeout_error_requests_silenced" { + description = "Groups to
mute for Storage timeout monitor" + type = "map" + default = {} +} + variable "storage_timeout_error_requests_threshold_critical" { description = "Maximum acceptable percent of timeout error requests for a storage" default = 90 @@ -370,6 +610,12 @@ variable "storage_timeout_error_requests_threshold_warning" { default = 50 } +variable "storage_network_error_requests_silenced" { + description = "Groups to mute for Storage network errors monitor" + type = "map" + default = {} +} + variable "storage_network_error_requests_threshold_critical" { description = "Maximum acceptable percent of network error requests for a storage" default = 90 @@ -380,6 +626,12 @@ variable "storage_network_error_requests_threshold_warning" { default = 50 } +variable "storage_throttling_error_requests_silenced" { + description = "Groups to mute for Storage throttling error monitor" + type = "map" + default = {} +} + variable "storage_throttling_error_requests_threshold_critical" { description = "Maximum acceptable percent of throttling error requests for a storage" default = 90 @@ -390,6 +642,12 @@ variable "storage_throttling_error_requests_threshold_warning" { default = 50 } +variable "storage_server_other_error_requests_silenced" { + description = "Groups to mute for Storage server other errors monitor" + type = "map" + default = {} +} + variable "storage_server_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of server other error requests for a storage" default = 90 @@ -400,6 +658,12 @@ variable "storage_server_other_error_requests_threshold_warning" { default = 50 } +variable "storage_client_other_error_requests_silenced" { + description = "Groups to mute for Storage other errors monitor" + type = "map" + default = {} +} + variable "storage_client_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of client other error requests for a storage" default = 90 @@ -410,6 +674,12 @@ variable 
"storage_client_other_error_requests_threshold_warning" { default = 50 } +variable "storage_authorization_error_requests_silenced" { + description = "Groups to mute for Storage authorization errors monitor" + type = "map" + default = {} +} + variable "storage_authorization_error_requests_threshold_critical" { description = "Maximum acceptable percent of authorization error requests for a storage" default = 90 @@ -421,6 +691,18 @@ variable "storage_authorization_error_requests_threshold_warning" { } # Azure Stream Analytics specific variables +variable "streamanalytics_status_silenced" { + description = "Groups to mute for Stream Analytics status monitor" + type = "map" + default = {} +} + +variable "streamanalytics_su_utilization_silenced" { + description = "Groups to mute for Stream Analytics utilization monitor" + type = "map" + default = {} +} + variable "streamanalytics_su_utilization_threshold_warning" { description = "Streaming Unit utilization rate limit (warning threshold)" default = 60 @@ -431,7 +713,13 @@ variable "streamanalytics_su_utilization_threshold_critical" { default = 80 } -variable "streamanalytics_function_requests_threshold_warning" { +variable "streamanalytics_failed_function_requests_silenced" { + description = "Groups to mute for Stream Analytics failed requests monitor" + type = "map" + default = {} +} + +variable "streamanalytics_failed_function_requests_threshold_warning" { description = "Failed Function Request rate limit (warning threshold)" default = 0 } @@ -441,6 +729,12 @@ variable "streamanalytics_failed_function_requests_threshold_critical" { default = 10 } +variable "streamanalytics_conversion_errors_silenced" { + description = "Groups to mute for Stream Analytics conversion errors monitor" + type = "map" + default = {} +} + variable "streamanalytics_conversion_errors_threshold_warning" { description = "Conversion errors limit (warning threshold)" default = 0 @@ -451,6 +745,12 @@ variable 
"streamanalytics_conversion_errors_threshold_critical" { default = 10 } +variable "streamanalytics_runtime_errors_silenced" { + description = "Groups to mute for Stream Analytics runtime errors monitor" + type = "map" + default = {} +} + variable "streamanalytics_runtime_errors_threshold_warning" { description = "Runtime errors limit (warning threshold)" default = 0 diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index 4b29bdc..e8b799a 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -39,33 +39,48 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| | delay | Delay in seconds for the metric evaluation | string | `600` | no | +| dropped_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub dropped d2c telemetry monitor | map | `` | no | | dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no | | dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no | | environment | Architecture Environment | string | - | yes | +| failed_c2d_methods_rate_silenced | Groups to mute for IoT Hub failed c2d methods monitor | map | `` | no | | failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `90` | no | | failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `50` | no | +| failed_c2d_twin_read_rate_silenced | Groups to mute for IoT Hub failed c2d twin read monitor | map | `` | no | | failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `90` | no | | failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `50` | no | +| failed_c2d_twin_update_rate_silenced | Groups to mute for IoT Hub failed c2d twin update monitor | map | `` 
| no | | failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `90` | no | | failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `50` | no | +| failed_d2c_twin_read_rate_silenced | Groups to mute for IoT Hub failed d2c twin read monitor | map | `` | no | | failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `90` | no | | failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `50` | no | +| failed_d2c_twin_update_rate_silenced | Groups to mute for IoT Hub failed d2c twin update monitor | map | `` | no | | failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `90` | no | | failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `50` | no | +| failed_jobs_rate_silenced | Groups to mute for IoT Hub failed jobs monitor | map | `` | no | | failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `90` | no | | failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `50` | no | +| failed_listjobs_rate_silenced | Groups to mute for IoT Hub failed list jobs monitor | map | `` | no | | failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `90` | no | | failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `50` | no | +| failed_queryjobs_rate_silenced | Groups to mute for IoT Hub failed query jobs monitor | map | `` | no | | failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `90` | no | | failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` | no | +| 
fallback_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub fallback d2c telemetry monitor | map | `` | no | | fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | | fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | | filter_tags | Tags used for filtering | string | `*` | no | +| invalid_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub invalid d2c telemetry monitor | map | `` | no | | invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | | invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | | message | Message sent when an alert is triggered | string | - | yes | +| orphaned_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub orphaned d2c telemetry monitor | map | `` | no | | orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no | | orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no | +| status_silenced | Groups to mute for IoT Hub status monitor | map | `` | no | +| too_many_d2c_telemetry_ingress_nosent_silenced | Groups to mute for IoT Hub unsent d2c telemetry monitor | map | `` | no | +| total_devices_silenced | Groups to mute for IoT Hub total device monitor | map | `` | no | Related documentation --------------------- diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index f9f1844..2eaaefc 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -20,6 +20,30 @@ variable "filter_tags" { } # Azure IOT hubs specific +variable "status_silenced" { + description = "Groups to mute for IoT Hub status monitor" + type = "map" + default = {} +} + +variable "total_devices_silenced" 
{ + description = "Groups to mute for IoT Hub total device monitor" + type = "map" + default = {} +} + +variable "too_many_d2c_telemetry_ingress_nosent_silenced" { + description = "Groups to mute for IoT Hub unsent d2c telemetry monitor" + type = "map" + default = {} +} + +variable "failed_jobs_rate_silenced" { + description = "Groups to mute for IoT Hub failed jobs monitor" + type = "map" + default = {} +} + variable "failed_jobs_rate_threshold_warning" { description = "Jobs Failed rate limit (warning threshold)" default = 50 @@ -30,6 +54,12 @@ variable "failed_jobs_rate_threshold_critical" { default = 90 } +variable "failed_listjobs_rate_silenced" { + description = "Groups to mute for IoT Hub failed list jobs monitor" + type = "map" + default = {} +} + variable "failed_listjobs_rate_threshold_warning" { description = "ListJobs Failed rate limit (warning threshold)" default = 50 @@ -40,6 +70,12 @@ variable "failed_listjobs_rate_threshold_critical" { default = 90 } +variable "failed_queryjobs_rate_silenced" { + description = "Groups to mute for IoT Hub failed query jobs monitor" + type = "map" + default = {} +} + variable "failed_queryjobs_rate_threshold_warning" { description = "QueryJobs Failed rate limit (warning threshold)" default = 50 @@ -50,6 +86,12 @@ variable "failed_queryjobs_rate_threshold_critical" { default = 90 } +variable "failed_c2d_methods_rate_silenced" { + description = "Groups to mute for IoT Hub failed c2d methods monitor" + type = "map" + default = {} +} + variable "failed_c2d_methods_rate_threshold_warning" { description = "C2D Methods Failed rate limit (warning threshold)" default = 50 @@ -60,6 +102,12 @@ variable "failed_c2d_methods_rate_threshold_critical" { default = 90 } +variable "failed_c2d_twin_read_rate_silenced" { + description = "Groups to mute for IoT Hub failed c2d twin read monitor" + type = "map" + default = {} +} + variable "failed_c2d_twin_read_rate_threshold_warning" { description = "C2D Twin Read Failed rate limit (warning 
threshold)" default = 50 @@ -70,6 +118,12 @@ variable "failed_c2d_twin_read_rate_threshold_critical" { default = 90 } +variable "failed_c2d_twin_update_rate_silenced" { + description = "Groups to mute for IoT Hub failed c2d twin update monitor" + type = "map" + default = {} +} + variable "failed_c2d_twin_update_rate_threshold_warning" { description = "C2D Twin Update Failed rate limit (warning threshold)" default = 50 @@ -80,6 +134,12 @@ variable "failed_c2d_twin_update_rate_threshold_critical" { default = 90 } +variable "failed_d2c_twin_read_rate_silenced" { + description = "Groups to mute for IoT Hub failed d2c twin read monitor" + type = "map" + default = {} +} + variable "failed_d2c_twin_read_rate_threshold_warning" { description = "D2C Twin Read Failed rate limit (warning threshold)" default = 50 @@ -90,6 +150,12 @@ variable "failed_d2c_twin_read_rate_threshold_critical" { default = 90 } +variable "failed_d2c_twin_update_rate_silenced" { + description = "Groups to mute for IoT Hub failed d2c twin update monitor" + type = "map" + default = {} +} + variable "failed_d2c_twin_update_rate_threshold_warning" { description = "D2C Twin Update Failed rate limit (warning threshold)" default = 50 @@ -100,6 +166,12 @@ variable "failed_d2c_twin_update_rate_threshold_critical" { default = 90 } +variable "dropped_d2c_telemetry_egress_silenced" { + description = "Groups to mute for IoT Hub dropped d2c telemetry monitor" + type = "map" + default = {} +} + variable "dropped_d2c_telemetry_egress_threshold_warning" { description = "D2C Telemetry Dropped limit (warning threshold)" default = 500 @@ -110,6 +182,12 @@ variable "dropped_d2c_telemetry_egress_threshold_critical" { default = 1000 } +variable "orphaned_d2c_telemetry_egress_silenced" { + description = "Groups to mute for IoT Hub orphaned d2c telemetry monitor" + type = "map" + default = {} +} + variable "orphaned_d2c_telemetry_egress_threshold_warning" { description = "D2C Telemetry Orphaned limit (warning threshold)" 
default = 500 @@ -120,6 +198,12 @@ variable "orphaned_d2c_telemetry_egress_threshold_critical" { default = 1000 } +variable "invalid_d2c_telemetry_egress_silenced" { + description = "Groups to mute for IoT Hub invalid d2c telemetry monitor" + type = "map" + default = {} +} + variable "invalid_d2c_telemetry_egress_threshold_warning" { description = "D2C Telemetry Invalid limit (warning threshold)" default = 500 @@ -130,6 +214,12 @@ variable "invalid_d2c_telemetry_egress_threshold_critical" { default = 1000 } +variable "fallback_d2c_telemetry_egress_silenced" { + description = "Groups to mute for IoT Hub fallback d2c telemetry monitor" + type = "map" + default = {} +} + variable "fallback_d2c_telemetry_egress_threshold_warning" { description = "D2C Telemetry Fallback limit (warning threshold)" default = 500 diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 3b2a2ca..35cc12b 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -17,6 +17,8 @@ resource "datadog_monitor" "too_many_jobs_failed" { critical = "${var.failed_jobs_rate_threshold_critical}" } + silenced = "${var.failed_jobs_rate_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -50,6 +52,8 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { critical = "${var.failed_listjobs_rate_threshold_critical}" } + silenced = "${var.failed_listjobs_rate_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -83,6 +87,8 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { critical = "${var.failed_queryjobs_rate_threshold_critical}" } + silenced = "${var.failed_queryjobs_rate_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -107,6 +113,8 @@ resource "datadog_monitor" "status" { type = "metric alert" + silenced = "${var.status_silenced}" + notify_no_data = true evaluation_delay = 
"${var.delay}" renotify_interval = 0 @@ -131,6 +139,8 @@ resource "datadog_monitor" "total_devices" { type = "metric alert" + silenced = "${var.total_devices_silenced}" + notify_no_data = true evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -164,6 +174,8 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { critical = "${var.failed_c2d_methods_rate_threshold_critical}" } + silenced = "${var.failed_c2d_methods_rate_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -197,6 +209,8 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { critical = "${var.failed_c2d_twin_read_rate_threshold_critical}" } + silenced = "${var.failed_c2d_twin_read_rate_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -230,6 +244,8 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { critical = "${var.failed_c2d_twin_update_rate_threshold_critical}" } + silenced = "${var.failed_c2d_twin_update_rate_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -263,6 +279,8 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { critical = "${var.failed_d2c_twin_read_rate_threshold_critical}" } + silenced = "${var.failed_d2c_twin_read_rate_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -296,6 +314,8 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { critical = "${var.failed_d2c_twin_update_rate_threshold_critical}" } + silenced = "${var.failed_d2c_twin_update_rate_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -327,6 +347,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { critical = "${var.dropped_d2c_telemetry_egress_threshold_critical}" } + silenced = "${var.dropped_d2c_telemetry_egress_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -358,6 
+380,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { critical = "${var.orphaned_d2c_telemetry_egress_threshold_critical}" } + silenced = "${var.orphaned_d2c_telemetry_egress_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -389,6 +413,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { critical = "${var.invalid_d2c_telemetry_egress_threshold_critical}" } + silenced = "${var.invalid_d2c_telemetry_egress_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -420,6 +446,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { critical = "${var.fallback_d2c_telemetry_egress_threshold_critical}" } + silenced = "${var.fallback_d2c_telemetry_egress_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -447,6 +475,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { type = "metric alert" + silenced = "${var.too_many_d2c_telemetry_ingress_nosent_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 diff --git a/cloud/azure/monitors.tf b/cloud/azure/monitors.tf index 073cab9..244050e 100644 --- a/cloud/azure/monitors.tf +++ b/cloud/azure/monitors.tf @@ -8,12 +8,17 @@ module "apimanagement" { filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_custom = "${var.filter_tags_custom}" + status_silenced = "${var.apimanagement_status_silenced}" + failed_requests_silenced = "${var.apimanagement_failed_requests_silenced}" failed_requests_threshold_critical = "${var.apimanagement_failed_requests_threshold_critical}" failed_requests_threshold_warning = "${var.apimanagement_failed_requests_threshold_warning}" + other_requests_silenced = "${var.apimanagement_other_requests_silenced}" other_requests_threshold_critical = "${var.apimanagement_other_requests_threshold_critical}" other_requests_threshold_warning = 
"${var.apimanagement_other_requests_threshold_warning}" + successful_requests_silenced = "${var.apimanagement_successful_requests_silenced}" successful_requests_threshold_critical = "${var.apimanagement_successful_requests_threshold_critical}" successful_requests_threshold_warning = "${var.apimanagement_successful_requests_threshold_warning}" + unauthorized_requests_silenced = "${var.apimanagement_unauthorized_requests_silenced}" unauthorized_requests_threshold_critical = "${var.apimanagement_unauthorized_requests_threshold_critical}" unauthorized_requests_threshold_warning = "${var.apimanagement_unauthorized_requests_threshold_warning}" } @@ -28,14 +33,19 @@ module "appservices" { filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_custom = "${var.filter_tags_custom}" + http_successful_requests_silenced = "${var.appservices_http_successful_requests_silenced}" http_successful_requests_threshold_critical = "${var.appservices_http_successful_requests_threshold_critical}" http_successful_requests_threshold_warning = "${var.appservices_http_successful_requests_threshold_warning}" + http_5xx_requests_silenced = "${var.appservices_http_5xx_requests_silenced}" http_5xx_requests_threshold_critical = "${var.appservices_http_5xx_requests_threshold_critical}" http_5xx_requests_threshold_warning = "${var.appservices_http_5xx_requests_threshold_warning}" + http_4xx_requests_silenced = "${var.appservices_http_4xx_requests_silenced}" http_4xx_requests_threshold_critical = "${var.appservices_http_4xx_requests_threshold_critical}" http_4xx_requests_threshold_warning = "${var.appservices_http_4xx_requests_threshold_warning}" + memory_usage_silenced = "${var.appservices_memory_usage_silenced}" memory_usage_threshold_critical = "${var.appservices_memory_usage_threshold_critical}" memory_usage_threshold_warning = "${var.appservices_memory_usage_threshold_warning}" + response_time_silenced = "${var.appservices_response_time_silenced}" 
response_time_threshold_critical = "${var.appservices_response_time_threshold_critical}" response_time_threshold_warning = "${var.appservices_response_time_threshold_warning}" } @@ -50,8 +60,11 @@ module "eventhub" { filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_custom = "${var.filter_tags_custom}" + status_silenced = "${var.eventhub_status_silenced}" + errors_rate_silenced = "${var.eventhub_errors_rate_silenced}" errors_rate_thresold_critical = "${var.eventhub_errors_rate_thresold_critical}" errors_rate_thresold_warning = "${var.eventhub_errors_rate_thresold_warning}" + failed_requests_rate_silenced = "${var.eventhub_failed_requests_rate_silenced}" failed_requests_rate_thresold_critical = "${var.eventhub_failed_requests_rate_thresold_critical}" failed_requests_rate_thresold_warning = "${var.eventhub_failed_requests_rate_thresold_warning}" } @@ -65,28 +78,43 @@ module "iothub" { filter_tags = "${var.non_taggable_filter_tags}" + status_silenced = "${var.iothub_status_silenced}" + total_devices_silenced = "${var.iothub_total_devices_silenced}" + too_many_d2c_telemetry_ingress_nosent_silenced = "${var.iothub_too_many_d2c_telemetry_ingress_nosent_silenced}" + dropped_d2c_telemetry_egress_silenced = "${var.iothub_dropped_d2c_telemetry_egress_silenced}" dropped_d2c_telemetry_egress_threshold_critical = "${var.iothub_dropped_d2c_telemetry_egress_threshold_critical}" dropped_d2c_telemetry_egress_threshold_warning = "${var.iothub_dropped_d2c_telemetry_egress_threshold_warning}" + failed_c2d_methods_rate_silenced = "${var.iothub_failed_c2d_methods_rate_silenced}" failed_c2d_methods_rate_threshold_critical = "${var.iothub_failed_c2d_methods_rate_threshold_critical}" failed_c2d_methods_rate_threshold_warning = "${var.iothub_failed_c2d_methods_rate_threshold_warning}" + failed_c2d_twin_read_rate_silenced = "${var.iothub_failed_c2d_twin_read_rate_silenced}" failed_c2d_twin_read_rate_threshold_critical = 
"${var.iothub_failed_c2d_twin_read_rate_threshold_critical}" failed_c2d_twin_read_rate_threshold_warning = "${var.iothub_failed_c2d_twin_read_rate_threshold_warning}" + failed_c2d_twin_update_rate_silenced = "${var.iothub_failed_c2d_twin_update_rate_silenced}" failed_c2d_twin_update_rate_threshold_critical = "${var.iothub_failed_c2d_twin_update_rate_threshold_critical}" failed_c2d_twin_update_rate_threshold_warning = "${var.iothub_failed_c2d_twin_update_rate_threshold_warning}" + failed_d2c_twin_read_rate_silenced = "${var.iothub_failed_d2c_twin_read_rate_silenced}" failed_d2c_twin_read_rate_threshold_critical = "${var.iothub_failed_d2c_twin_read_rate_threshold_critical}" failed_d2c_twin_read_rate_threshold_warning = "${var.iothub_failed_d2c_twin_read_rate_threshold_warning}" + failed_d2c_twin_update_rate_silenced = "${var.iothub_failed_d2c_twin_update_rate_silenced}" failed_d2c_twin_update_rate_threshold_critical = "${var.iothub_failed_d2c_twin_update_rate_threshold_critical}" failed_d2c_twin_update_rate_threshold_warning = "${var.iothub_failed_d2c_twin_update_rate_threshold_warning}" + failed_jobs_rate_silenced = "${var.iothub_failed_jobs_rate_silenced}" failed_jobs_rate_threshold_critical = "${var.iothub_failed_jobs_rate_threshold_critical}" failed_jobs_rate_threshold_warning = "${var.iothub_failed_jobs_rate_threshold_warning}" + failed_listjobs_rate_silenced = "${var.iothub_failed_listjobs_rate_silenced}" failed_listjobs_rate_threshold_critical = "${var.iothub_failed_listjobs_rate_threshold_critical}" failed_listjobs_rate_threshold_warning = "${var.iothub_failed_listjobs_rate_threshold_warning}" + failed_queryjobs_rate_silenced = "${var.iothub_failed_queryjobs_rate_silenced}" failed_queryjobs_rate_threshold_critical = "${var.iothub_failed_queryjobs_rate_threshold_critical}" failed_queryjobs_rate_threshold_warning = "${var.iothub_failed_queryjobs_rate_threshold_warning}" + fallback_d2c_telemetry_egress_silenced = 
"${var.iothub_fallback_d2c_telemetry_egress_silenced}" fallback_d2c_telemetry_egress_threshold_critical = "${var.iothub_fallback_d2c_telemetry_egress_threshold_critical}" fallback_d2c_telemetry_egress_threshold_warning = "${var.iothub_fallback_d2c_telemetry_egress_threshold_warning}" + invalid_d2c_telemetry_egress_silenced = "${var.iothub_invalid_d2c_telemetry_egress_silenced}" invalid_d2c_telemetry_egress_threshold_critical = "${var.iothub_invalid_d2c_telemetry_egress_threshold_critical}" invalid_d2c_telemetry_egress_threshold_warning = "${var.iothub_invalid_d2c_telemetry_egress_threshold_warning}" + orphaned_d2c_telemetry_egress_silenced = "${var.iothub_orphaned_d2c_telemetry_egress_silenced}" orphaned_d2c_telemetry_egress_threshold_critical = "${var.iothub_orphaned_d2c_telemetry_egress_threshold_critical}" orphaned_d2c_telemetry_egress_threshold_warning = "${var.iothub_orphaned_d2c_telemetry_egress_threshold_warning}" } @@ -101,10 +129,14 @@ module "redis" { filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_custom = "${var.filter_tags_custom}" + status_silenced = "${var.redis_status_silenced}" + evictedkeys_limit_silenced = "${var.redis_evictedkeys_limit_silenced}" evictedkeys_limit_threshold_critical = "${var.redis_evictedkeys_limit_threshold_critical}" evictedkeys_limit_threshold_warning = "${var.redis_evictedkeys_limit_threshold_warning}" + percent_processor_time_silenced = "${var.redis_percent_processor_time_silenced}" percent_processor_time_threshold_critical = "${var.redis_percent_processor_time_threshold_critical}" percent_processor_time_threshold_warning = "${var.redis_percent_processor_time_threshold_warning}" + server_load_rate_silenced = "${var.redis_server_load_rate_silenced}" server_load_rate_threshold_critical = "${var.redis_server_load_rate_threshold_critical}" server_load_rate_threshold_warning = "${var.redis_server_load_rate_threshold_warning}" } @@ -119,11 +151,15 @@ module "sqldatabase" { filter_tags_use_defaults = 
"${var.filter_tags_use_defaults}" filter_tags_custom = "${var.filter_tags_custom}" + cpu_silenced = "${var.sqldatabase_cpu_silenced}" cpu_threshold_critical = "${var.sqldatabase_cpu_threshold_critical}" cpu_threshold_warning = "${var.sqldatabase_cpu_threshold_warning}" + deadlock_silenced = "${var.sqldatabase_deadlock_silenced}" deadlock_threshold_critical = "${var.sqldatabase_deadlock_threshold_critical}" + diskspace_silenced = "${var.sqldatabase_diskspace_silenced}" diskspace_threshold_critical = "${var.sqldatabase_diskspace_threshold_critical}" diskspace_threshold_warning = "${var.sqldatabase_diskspace_threshold_warning}" + dtu_silenced = "${var.sqldatabase_dtu_silenced}" dtu_threshold_critical = "${var.sqldatabase_dtu_threshold_critical}" dtu_threshold_warning = "${var.sqldatabase_dtu_threshold_warning}" } @@ -138,22 +174,31 @@ module "storage" { filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_custom = "${var.filter_tags_custom}" + authorization_error_requests_silenced = "${var.storage_authorization_error_requests_silenced}" authorization_error_requests_threshold_critical = "${var.storage_authorization_error_requests_threshold_critical}" authorization_error_requests_threshold_warning = "${var.storage_authorization_error_requests_threshold_warning}" + availability_silenced = "${var.storage_availability_silenced}" availability_threshold_critical = "${var.storage_availability_threshold_critical}" availability_threshold_warning = "${var.storage_availability_threshold_warning}" + client_other_error_requests_silenced = "${var.storage_client_other_error_requests_silenced}" client_other_error_requests_threshold_critical = "${var.storage_client_other_error_requests_threshold_critical}" client_other_error_requests_threshold_warning = "${var.storage_client_other_error_requests_threshold_warning}" + latency_silenced = "${var.storage_latency_silenced}" latency_threshold_critical = "${var.storage_latency_threshold_critical}" latency_threshold_warning 
= "${var.storage_latency_threshold_warning}" + network_error_requests_silenced = "${var.storage_network_error_requests_silenced}" network_error_requests_threshold_critical = "${var.storage_network_error_requests_threshold_critical}" network_error_requests_threshold_warning = "${var.storage_network_error_requests_threshold_warning}" + server_other_error_requests_silenced = "${var.storage_server_other_error_requests_silenced}" server_other_error_requests_threshold_critical = "${var.storage_server_other_error_requests_threshold_critical}" server_other_error_requests_threshold_warning = "${var.storage_server_other_error_requests_threshold_warning}" + successful_requests_silenced = "${var.storage_successful_requests_silenced}" successful_requests_threshold_critical = "${var.storage_successful_requests_threshold_critical}" successful_requests_threshold_warning = "${var.storage_successful_requests_threshold_warning}" + throttling_error_requests_silenced = "${var.storage_throttling_error_requests_silenced}" throttling_error_requests_threshold_critical = "${var.storage_throttling_error_requests_threshold_critical}" throttling_error_requests_threshold_warning = "${var.storage_throttling_error_requests_threshold_warning}" + timeout_error_requests_silenced = "${var.storage_timeout_error_requests_silenced}" timeout_error_requests_threshold_critical = "${var.storage_timeout_error_requests_threshold_critical}" timeout_error_requests_threshold_warning = "${var.storage_timeout_error_requests_threshold_warning}" } @@ -168,12 +213,17 @@ module "streamanalytics" { filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_custom = "${var.filter_tags_custom}" + status_silenced = "${var.streamanalytics_status_silenced}" + conversion_errors_silenced = "${var.streamanalytics_conversion_errors_silenced}" conversion_errors_threshold_critical = "${var.streamanalytics_conversion_errors_threshold_critical}" conversion_errors_threshold_warning = "${var.streamanalytics_conversion_errors_threshold_warning}" + failed_function_requests_silenced 
= "${var.streamanalytics_failed_function_requests_silenced}" failed_function_requests_threshold_critical = "${var.streamanalytics_failed_function_requests_threshold_critical}" - function_requests_threshold_warning = "${var.streamanalytics_function_requests_threshold_warning}" + failed_function_requests_threshold_warning = "${var.streamanalytics_failed_function_requests_threshold_warning}" + runtime_errors_silenced = "${var.streamanalytics_runtime_errors_silenced}" runtime_errors_threshold_critical = "${var.streamanalytics_runtime_errors_threshold_critical}" runtime_errors_threshold_warning = "${var.streamanalytics_runtime_errors_threshold_warning}" + su_utilization_silenced = "${var.streamanalytics_su_utilization_silenced}" su_utilization_threshold_critical = "${var.streamanalytics_su_utilization_threshold_critical}" su_utilization_threshold_warning = "${var.streamanalytics_su_utilization_threshold_warning}" } diff --git a/cloud/azure/redis/README.md b/cloud/azure/redis/README.md index 4cd7a51..c1845ce 100644 --- a/cloud/azure/redis/README.md +++ b/cloud/azure/redis/README.md @@ -29,13 +29,17 @@ Inputs |------|-------------|:----:|:-----:|:-----:| | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | +| evictedkeys_limit_silenced | Groups to mute for Redis evicted keys monitor | map | `` | no | | evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no | | evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when a Redis monitor is triggered | string | - | yes | +| percent_processor_time_silenced | Groups to mute for Redis processor monitor | map | `` | no | | 
percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no | | percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no | +| server_load_rate_silenced | Groups to mute for Redis server load monitor | map | `` | no | | server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | | server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no | +| status_silenced | Groups to mute for Redis status monitor | map | `` | no | diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf index 18f0448..79a8592 100644 --- a/cloud/azure/redis/inputs.tf +++ b/cloud/azure/redis/inputs.tf @@ -25,6 +25,18 @@ variable "filter_tags_custom" { } # Azure Redis specific +variable "status_silenced" { + description = "Groups to mute for Redis status monitor" + type = "map" + default = {} +} + +variable "evictedkeys_limit_silenced" { + description = "Groups to mute for Redis evicted keys monitor" + type = "map" + default = {} +} + variable "evictedkeys_limit_threshold_warning" { description = "Evicted keys limit (warning threshold)" default = 0 @@ -35,6 +47,12 @@ variable "evictedkeys_limit_threshold_critical" { default = 100 } +variable "percent_processor_time_silenced" { + description = "Groups to mute for Redis processor monitor" + type = "map" + default = {} +} + variable "percent_processor_time_threshold_critical" { description = "Processor time percent (critical threshold)" default = 80 @@ -45,6 +63,12 @@ variable "percent_processor_time_threshold_warning" { default = 60 } +variable "server_load_rate_silenced" { + description = "Groups to mute for Redis server load monitor" + type = "map" + default = {} +} + variable "server_load_rate_threshold_critical" { description = "Server CPU load rate (critical threshold)" default = 90 diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index ffde7bd..7b9ad62 100644 --- 
a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -16,6 +16,8 @@ EOF type = "metric alert" + silenced = "${var.status_silenced}" + notify_no_data = true evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -47,6 +49,8 @@ EOF critical = "${var.evictedkeys_limit_threshold_critical}" } + silenced = "${var.evictedkeys_limit_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -78,6 +82,8 @@ EOF critical = "${var.percent_processor_time_threshold_critical}" } + silenced = "${var.percent_processor_time_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -109,6 +115,8 @@ EOF critical = "${var.server_load_rate_threshold_critical}" } + silenced = "${var.server_load_rate_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 diff --git a/cloud/azure/sql-database/README.md b/cloud/azure/sql-database/README.md index 8f42bde..833d892 100644 --- a/cloud/azure/sql-database/README.md +++ b/cloud/azure/sql-database/README.md @@ -27,12 +27,16 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| +| cpu_silenced | Groups to mute for SQL CPU monitor | map | `` | no | | cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | | cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no | +| deadlock_silenced | Groups to mute for SQL Deadlock monitor | map | `` | no | | deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | +| diskspace_silenced | Groups to mute for SQL disk space monitor | map | `` | no | | diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no | | diskspace_threshold_warning | Disk space used in percent (warning threshold) | string 
| `80` | no | +| dtu_silenced | Groups to mute for SQL DTU monitor | map | `` | no | | dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no | | dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no | | environment | Architecture Environment | string | - | yes | diff --git a/cloud/azure/sql-database/inputs.tf b/cloud/azure/sql-database/inputs.tf index aa81cfb..58be3d4 100644 --- a/cloud/azure/sql-database/inputs.tf +++ b/cloud/azure/sql-database/inputs.tf @@ -25,6 +25,11 @@ variable "filter_tags_custom" { } # Azure SQL Database specific +variable "cpu_silenced" { + description = "Groups to mute for SQL CPU monitor" + type = "map" + default = {} +} variable "cpu_threshold_warning" { description = "CPU usage in percent (warning threshold)" @@ -36,6 +41,12 @@ variable "cpu_threshold_critical" { default = "90" } +variable "diskspace_silenced" { + description = "Groups to mute for SQL disk space monitor" + type = "map" + default = {} +} + variable "diskspace_threshold_warning" { description = "Disk space used in percent (warning threshold)" default = "80" @@ -46,6 +57,12 @@ variable "diskspace_threshold_critical" { default = "90" } +variable "dtu_silenced" { + description = "Groups to mute for SQL DTU monitor" + type = "map" + default = {} +} + variable "dtu_threshold_warning" { description = "Amount of DTU used (warning threshold)" default = "85" @@ -56,6 +73,12 @@ variable "dtu_threshold_critical" { default = "90" } +variable "deadlock_silenced" { + description = "Groups to mute for SQL Deadlock monitor" + type = "map" + default = {} +} + variable "deadlock_threshold_critical" { description = "Amount of Deadlocks (critical threshold)" default = "1" diff --git a/cloud/azure/sql-database/monitors-sql-database-basics.tf b/cloud/azure/sql-database/monitors-sql-database-basics.tf index e33b4d5..6df1cd3 100644 --- a/cloud/azure/sql-database/monitors-sql-database-basics.tf +++ 
b/cloud/azure/sql-database/monitors-sql-database-basics.tf @@ -22,6 +22,8 @@ resource "datadog_monitor" "sql-database_cpu_90_15min" { critical = "${var.cpu_threshold_critical}" } + silenced = "${var.cpu_silenced}" + notify_no_data = true evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -53,6 +55,8 @@ resource "datadog_monitor" "sql-database_free_space_low" { critical = "${var.diskspace_threshold_critical}" } + silenced = "${var.diskspace_silenced}" + notify_no_data = true evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -84,6 +88,8 @@ resource "datadog_monitor" "sql-database_dtu_consumption_high" { critical = "${var.dtu_threshold_critical}" } + silenced = "${var.dtu_silenced}" + notify_no_data = true evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -114,6 +120,8 @@ resource "datadog_monitor" "sql-database_deadlocks_count" { critical = "${var.deadlock_threshold_critical}" } + silenced = "${var.deadlock_silenced}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 0 diff --git a/cloud/azure/storage/README.md b/cloud/azure/storage/README.md index 7d72473..83e1dca 100644 --- a/cloud/azure/storage/README.md +++ b/cloud/azure/storage/README.md @@ -32,27 +32,36 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| +| authorization_error_requests_silenced | Groups to mute for Storage authorization errors monitor | map | `` | no | | authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `90` | no | | authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `50` | no | +| availability_silenced | Groups to mute for Storage availability monitor | map | `` | no | | availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `50` | no | | 
availability_threshold_warning | Warning regarding acceptable percent of availability for a storage | string | `90` | no | +| client_other_error_requests_silenced | Groups to mute for Storage other errors monitor | map | `` | no | | client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `90` | no | | client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `50` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| latency_silenced | Groups to mute for Storage latency monitor | map | `` | no | | latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `2000` | no | | latency_threshold_warning | Warning regarding acceptable end to end latency (ms) for a storage | string | `1000` | no | | message | Message sent when a Redis monitor is triggered | string | - | yes | +| network_error_requests_silenced | Groups to mute for Storage network errors monitor | map | `` | no | | network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `90` | no | | network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `50` | no | +| server_other_error_requests_silenced | Groups to mute for Storage server other errors monitor | map | `` | no | | server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `90` | no | | server_other_error_requests_threshold_warning | Warning regarding acceptable 
percent of server other error requests for a storage | string | `50` | no | +| successful_requests_silenced | Groups to mute for Storage successful requests monitor | map | `` | no | | successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `10` | no | | successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `30` | no | +| throttling_error_requests_silenced | Groups to mute for Storage throttling error monitor | map | `` | no | | throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `90` | no | | throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `50` | no | +| timeout_error_requests_silenced | Groups to mute for Storage timeout monitor | map | `` | no | | timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `90` | no | | timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `50` | no | diff --git a/cloud/azure/storage/inputs.tf b/cloud/azure/storage/inputs.tf index 83468fd..f665a6e 100644 --- a/cloud/azure/storage/inputs.tf +++ b/cloud/azure/storage/inputs.tf @@ -25,6 +25,12 @@ variable "filter_tags_custom" { } # Azure Storage specific +variable "availability_silenced" { + description = "Groups to mute for Storage availability monitor" + type = "map" + default = {} +} + variable "availability_threshold_critical" { description = "Minimum acceptable percent of availability for a storage" default = 50 @@ -35,6 +41,12 @@ variable "availability_threshold_warning" { default = 90 } +variable "successful_requests_silenced" { + description = "Groups to mute for Storage successful requests monitor" + type = "map" + default = {} +} + variable 
"successful_requests_threshold_critical" { description = "Minimum acceptable percent of successful requests for a storage" default = 10 @@ -45,6 +57,12 @@ variable "successful_requests_threshold_warning" { default = 30 } +variable "latency_silenced" { + description = "Groups to mute for Storage latency monitor" + type = "map" + default = {} +} + variable "latency_threshold_critical" { description = "Maximum acceptable end to end latency (ms) for a storage" default = 2000 @@ -55,6 +73,12 @@ variable "latency_threshold_warning" { default = 1000 } +variable "timeout_error_requests_silenced" { + description = "Groups to mute for Storage timeout monitor" + type = "map" + default = {} +} + variable "timeout_error_requests_threshold_critical" { description = "Maximum acceptable percent of timeout error requests for a storage" default = 90 @@ -65,6 +89,12 @@ variable "timeout_error_requests_threshold_warning" { default = 50 } +variable "network_error_requests_silenced" { + description = "Groups to mute for Storage network errors monitor" + type = "map" + default = {} +} + variable "network_error_requests_threshold_critical" { description = "Maximum acceptable percent of network error requests for a storage" default = 90 @@ -75,6 +105,12 @@ variable "network_error_requests_threshold_warning" { default = 50 } +variable "throttling_error_requests_silenced" { + description = "Groups to mute for Storage throttling error monitor" + type = "map" + default = {} +} + variable "throttling_error_requests_threshold_critical" { description = "Maximum acceptable percent of throttling error requests for a storage" default = 90 @@ -85,6 +121,12 @@ variable "throttling_error_requests_threshold_warning" { default = 50 } +variable "server_other_error_requests_silenced" { + description = "Groups to mute for Storage server other errors monitor" + type = "map" + default = {} +} + variable "server_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of server 
other error requests for a storage" default = 90 @@ -95,6 +137,12 @@ variable "server_other_error_requests_threshold_warning" { default = 50 } +variable "client_other_error_requests_silenced" { + description = "Groups to mute for Storage other errors monitor" + type = "map" + default = {} +} + variable "client_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of client other error requests for a storage" default = 90 @@ -105,6 +153,12 @@ variable "client_other_error_requests_threshold_warning" { default = 50 } +variable "authorization_error_requests_silenced" { + description = "Groups to mute for Storage authorization errors monitor" + type = "map" + default = {} +} + variable "authorization_error_requests_threshold_critical" { description = "Maximum acceptable percent of authorization error requests for a storage" default = 90 diff --git a/cloud/azure/storage/monitors-azure-storage.tf b/cloud/azure/storage/monitors-azure-storage.tf index 2d639c8..ed25396 100644 --- a/cloud/azure/storage/monitors-azure-storage.tf +++ b/cloud/azure/storage/monitors-azure-storage.tf @@ -21,6 +21,8 @@ EOF warning = "${var.availability_threshold_warning}" } + silenced = "${var.availability_silenced}" + type = "metric alert" notify_no_data = false notify_audit = false @@ -51,6 +53,8 @@ EOF warning = "${var.successful_requests_threshold_warning}" } + silenced = "${var.successful_requests_silenced}" + type = "metric alert" notify_no_data = false notify_audit = false @@ -81,6 +85,8 @@ EOF warning = "${var.latency_threshold_warning}" } + silenced = "${var.latency_silenced}" + type = "metric alert" notify_no_data = false notify_audit = false @@ -111,6 +117,8 @@ EOF warning = "${var.timeout_error_requests_threshold_warning}" } + silenced = "${var.timeout_error_requests_silenced}" + type = "metric alert" notify_no_data = false notify_audit = false @@ -141,6 +149,8 @@ EOF warning = "${var.network_error_requests_threshold_warning}" } + silenced = 
"${var.network_error_requests_silenced}" + type = "metric alert" notify_no_data = false notify_audit = false @@ -171,6 +181,8 @@ EOF warning = "${var.throttling_error_requests_threshold_warning}" } + silenced = "${var.throttling_error_requests_silenced}" + type = "metric alert" notify_no_data = false notify_audit = false @@ -201,6 +213,8 @@ EOF warning = "${var.server_other_error_requests_threshold_warning}" } + silenced = "${var.server_other_error_requests_silenced}" + type = "metric alert" notify_no_data = false notify_audit = false @@ -231,6 +245,8 @@ EOF warning = "${var.client_other_error_requests_threshold_warning}" } + silenced = "${var.client_other_error_requests_silenced}" + type = "metric alert" notify_no_data = false notify_audit = false @@ -261,6 +277,8 @@ EOF warning = "${var.authorization_error_requests_threshold_warning}" } + silenced = "${var.authorization_error_requests_silenced}" + type = "metric alert" notify_no_data = false notify_audit = false diff --git a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md index 53422c8..05276ab 100644 --- a/cloud/azure/stream-analytics/README.md +++ b/cloud/azure/stream-analytics/README.md @@ -19,17 +19,22 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| +| conversion_errors_silenced | Groups to mute for Stream Analytics conversion errors monitor | map | `` | no | | conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no | | conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | +| failed_function_requests_silenced | Groups to mute for Stream Analytics failed requests monitor | map | `` | no | | failed_function_requests_threshold_critical | Failed Function Request rate limit (critical 
threshold) | string | `10` | no | +| failed_function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | -| function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | | message | Message sent when a Redis monitor is triggered | string | - | yes | +| runtime_errors_silenced | Groups to mute for Stream Analytics runtime errors monitor | map | `` | no | | runtime_errors_threshold_critical | Runtime errors limit (critical threshold) | string | `10` | no | | runtime_errors_threshold_warning | Runtime errors limit (warning threshold) | string | `0` | no | +| status_silenced | Groups to mute for Stream Analytics status monitor | map | `` | no | +| su_utilization_silenced | Groups to mute for Stream Analytics utilization monitor | map | `` | no | | su_utilization_threshold_critical | Streaming Unit utilization rate limit (critical threshold) | string | `80` | no | | su_utilization_threshold_warning | Streaming Unit utilization rate limit (warning threshold) | string | `60` | no | diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index ce3c713..d0e86fc 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -25,6 +25,18 @@ variable "filter_tags_custom" { } # Azure Stream Analytics specific +variable "status_silenced" { + description = "Groups to mute for Stream Analytics status monitor" + type = "map" + default = {} +} + +variable "su_utilization_silenced" { + description = "Groups to mute for Stream Analytics utilization monitor" + type = "map" + default = {} +} + variable "su_utilization_threshold_warning" { description = "Streaming Unit utilization rate limit (warning 
threshold)" default = 60 @@ -35,7 +47,13 @@ variable "su_utilization_threshold_critical" { default = 80 } -variable "function_requests_threshold_warning" { +variable "failed_function_requests_silenced" { + description = "Groups to mute for Stream Analytics failed requests monitor" + type = "map" + default = {} +} + +variable "failed_function_requests_threshold_warning" { description = "Failed Function Request rate limit (warning threshold)" default = 0 } @@ -45,6 +63,12 @@ variable "failed_function_requests_threshold_critical" { default = 10 } +variable "conversion_errors_silenced" { + description = "Groups to mute for Stream Analytics conversion errors monitor" + type = "map" + default = {} +} + variable "conversion_errors_threshold_warning" { description = "Conversion errors limit (warning threshold)" default = 0 @@ -55,6 +79,12 @@ variable "conversion_errors_threshold_critical" { default = 10 } +variable "runtime_errors_silenced" { + description = "Groups to mute for Stream Analytics runtime errors monitor" + type = "map" + default = {} +} + variable "runtime_errors_threshold_warning" { description = "Runtime errors limit (warning threshold)" default = 0 diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 40d4c6d..71068c2 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -16,6 +16,8 @@ resource "datadog_monitor" "status" { type = "metric alert" + silenced = "${var.status_silenced}" + notify_no_data = true evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -58,6 +60,8 @@ resource "datadog_monitor" "su_utilization" { critical = "${var.su_utilization_threshold_critical}" } + silenced = "${var.su_utilization_silenced}" + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] } @@ -86,10 +90,12 @@ resource "datadog_monitor" "failed_function_requests" 
{ no_data_timeframe = 20 thresholds { - warning = "${var.function_requests_threshold_warning}" + warning = "${var.failed_function_requests_threshold_warning}" critical = "${var.failed_function_requests_threshold_critical}" } + silenced = "${var.failed_function_requests_silenced}" + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] } @@ -121,6 +127,8 @@ resource "datadog_monitor" "conversion_errors" { critical = "${var.conversion_errors_threshold_critical}" } + silenced = "${var.conversion_errors_silenced}" + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] } @@ -152,5 +160,7 @@ resource "datadog_monitor" "runtime_errors" { critical = "${var.runtime_errors_threshold_critical}" } + silenced = "${var.runtime_errors_silenced}" + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] }