diff --git a/cloud/azure/README.md b/cloud/azure/README.md index 05bc25d..3e65051 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -31,82 +31,144 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| apimanagement_failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no | -| apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no | -| apimanagement_successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no | -| apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no | -| appservices_http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `90` | no | -| appservices_http_successful_requests_threshold_warning | Minimum warning acceptable percent of 2xx & 3xx requests | string | `95` | no | -| appservices_http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no | -| appservices_http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no | -| appservices_http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no | -| appservices_http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no | -| appservices_memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | -| appservices_memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | -| appservices_response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | -| appservices_response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | +| 
apimanagement_failed_requests_silenced | Groups to mute for API Management failed requests monitor | map | `` | no | +| apimanagement_failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `90` | no | +| apimanagement_failed_requests_threshold_warning | Warning regarding acceptable percent of failed requests | string | `50` | no | +| apimanagement_other_requests_silenced | Groups to mute for API Management other requests monitor | map | `` | no | +| apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `90` | no | +| apimanagement_other_requests_threshold_warning | Warning regarding acceptable percent of other requests | string | `50` | no | +| apimanagement_status_silenced | Groups to mute for API Management status monitor | map | `` | no | +| apimanagement_successful_requests_silenced | Groups to mute for API Management successful requests monitor | map | `` | no | +| apimanagement_successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `10` | no | +| apimanagement_successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests | string | `30` | no | +| apimanagement_unauthorized_requests_silenced | Groups to mute for API Management unauthorized requests monitor | map | `` | no | +| apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `90` | no | +| apimanagement_unauthorized_requests_threshold_warning | Warning regarding acceptable percent of unauthorized requests | string | `50` | no | +| appservices_http_4xx_requests_silenced | Groups to mute for App Services 4xx requests monitor | map | `` | no | +| appservices_http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `90` | no | +| appservices_http_4xx_requests_threshold_warning | Warning regarding acceptable percent of 4xx errors 
| string | `50` | no | +| appservices_http_5xx_requests_silenced | Groups to mute for App Services 5xx requests monitor | map | `` | no | +| appservices_http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `90` | no | +| appservices_http_5xx_requests_threshold_warning | Warning regarding acceptable percent of 5xx errors | string | `50` | no | +| appservices_http_successful_requests_silenced | Groups to mute for App Services successful requests monitor | map | `` | no | +| appservices_http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `10` | no | +| appservices_http_successful_requests_threshold_warning | Warning regarding acceptable percent of 2xx & 3xx requests | string | `30` | no | +| appservices_memory_usage_silenced | Groups to mute for App Services memory usage monitor | map | `` | no | +| appservices_memory_usage_threshold_critical | Alerting threshold in Mib | string | `1073741824` | no | +| appservices_memory_usage_threshold_warning | Warning threshold in MiB | string | `536870912` | no | +| appservices_response_time_silenced | Groups to mute for App Services response time monitor | map | `` | no | +| appservices_response_time_threshold_critical | Alerting threshold for response time in seconds | string | `10` | no | +| appservices_response_time_threshold_warning | Warning threshold for response time in seconds | string | `5` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| eventhub_errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no | -| eventhub_errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no | -| eventhub_failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no | -| 
eventhub_failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no | +| eventhub_errors_rate_silenced | Groups to mute for Event Hub errors monitor | map | `` | no | +| eventhub_errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `90` | no | +| eventhub_errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `50` | no | +| eventhub_failed_requests_rate_silenced | Groups to mute for Event Hub failed requests monitor | map | `` | no | +| eventhub_failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `90` | no | +| eventhub_failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `50` | no | +| eventhub_status_silenced | Groups to mute for Event Hub status monitor | map | `` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| iothub_dropped_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub dropped d2c telemetry monitor | map | `` | no | | iothub_dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no | | iothub_dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no | -| iothub_failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | -| iothub_failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_c2d_twin_read_rate_threshold_warning | C2D 
Twin Read Failed rate limit (warning threshold) | string | `0` | no | -| iothub_failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | -| iothub_failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | -| iothub_failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | -| iothub_failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | -| iothub_failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | -| iothub_failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | -| iothub_failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_c2d_methods_rate_silenced | Groups to mute for IoT Hub failed c2d methods monitor | map | `` | no | +| iothub_failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `90` | no | +| iothub_failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `50` | no | +| 
iothub_failed_c2d_twin_read_rate_silenced | Groups to mute for IoT Hub failed c2d twin read monitor | map | `` | no | +| iothub_failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `90` | no | +| iothub_failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_c2d_twin_update_rate_silenced | Groups to mute for IoT Hub failed c2d twin update monitor | map | `` | no | +| iothub_failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `90` | no | +| iothub_failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_d2c_twin_read_rate_silenced | Groups to mute for IoT Hub failed d2c twin read monitor | map | `` | no | +| iothub_failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `90` | no | +| iothub_failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_d2c_twin_update_rate_silenced | Groups to mute for IoT Hub failed d2c twin update monitor | map | `` | no | +| iothub_failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `90` | no | +| iothub_failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_jobs_rate_silenced | Groups to mute for IoT Hub failed jobs monitor | map | `` | no | +| iothub_failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `90` | no | +| iothub_failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_listjobs_rate_silenced | Groups to mute for IoT Hub failed list jobs monitor | map | `` | 
no | +| iothub_failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `90` | no | +| iothub_failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `50` | no | +| iothub_failed_queryjobs_rate_silenced | Groups to mute for IoT Hub failed query jobs monitor | map | `` | no | +| iothub_failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `90` | no | +| iothub_failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` | no | +| iothub_fallback_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub fallback d2c telemetry monitor | map | `` | no | | iothub_fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | | iothub_fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | +| iothub_invalid_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub invalid d2c telemetry monitor | map | `` | no | | iothub_invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | | iothub_invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | +| iothub_orphaned_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub orphaned d2c telemetry monitor | map | `` | no | | iothub_orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no | | iothub_orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no | +| iothub_status_silenced | Groups to mute for IoT Hub status monitor | map | `` | no | +| iothub_too_many_d2c_telemetry_ingress_nosent_silenced | Groups to mute for IoT Hub unsent d2c telemetry monitor | map | 
`` | no | +| iothub_total_devices_silenced | Groups to mute for IoT Hub total device monitor | map | `` | no | | message | Message sent when a monitor is triggered | string | - | yes | | non_taggable_filter_tags | Tags used for filtering for components without tag support | string | `*` | no | +| redis_evictedkeys_limit_silenced | Groups to mute for Redis evicted keys monitor | map | `` | no | | redis_evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no | | redis_evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no | +| redis_percent_processor_time_silenced | Groups to mute for Redis processor monitor | map | `` | no | | redis_percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no | | redis_percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no | +| redis_server_load_rate_silenced | Groups to mute for Redis server load monitor | map | `` | no | | redis_server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | | redis_server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no | +| sqldatabase_cpu_silenced | Groups to mute for SQL CPU monitor | map | `` | no | | sqldatabase_cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | | sqldatabase_cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no | +| sqldatabase_deadlock_silenced | Groups to mute for SQL Deadlock monitor | map | `` | no | | sqldatabase_deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no | +| sqldatabase_diskspace_silenced | Groups to mute for SQL disk space monitor | map | `` | no | | sqldatabase_diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no | | 
sqldatabase_diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no | +| sqldatabase_dtu_silenced | Groups to mute for SQL DTU monitor | map | `` | no | | sqldatabase_dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no | | sqldatabase_dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no | -| storage_authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `15` | no | -| storage_availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `90` | no | -| storage_client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `15` | no | -| storage_latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `1000` | no | -| storage_network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `5` | no | -| storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `10` | no | -| storage_successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `90` | no | -| storage_throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `10` | no | -| storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `5` | no | +| storage_authorization_error_requests_silenced | Groups to mute for Storage authorization errors monitor | map | `` | no | +| storage_authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `90` | no | +| 
storage_authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `50` | no | +| storage_availability_silenced | Groups to mute for Storage availability monitor | map | `` | no | +| storage_availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `50` | no | +| storage_availability_threshold_warning | Warning regarding acceptable percent of availability for a storage | string | `90` | no | +| storage_client_other_error_requests_silenced | Groups to mute for Storage other errors monitor | map | `` | no | +| storage_client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `90` | no | +| storage_client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `50` | no | +| storage_latency_silenced | Groups to mute for Storage latency monitor | map | `` | no | +| storage_latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `2000` | no | +| storage_latency_threshold_warning | Warning regarding acceptable end to end latency (ms) for a storage | string | `1000` | no | +| storage_network_error_requests_silenced | Groups to mute for Storage network errors monitor | map | `` | no | +| storage_network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `90` | no | +| storage_network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `50` | no | +| storage_server_other_error_requests_silenced | Groups to mute for Storage server other errors monitor | map | `` | no | +| storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `90` | no | +| 
storage_server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `50` | no | +| storage_successful_requests_silenced | Groups to mute for Storage successful requests monitor | map | `` | no | +| storage_successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `10` | no | +| storage_successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `30` | no | +| storage_throttling_error_requests_silenced | Groups to mute for Storage throttling error monitor | map | `` | no | +| storage_throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `90` | no | +| storage_throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `50` | no | +| storage_timeout_error_requests_silenced | Groups to mute for Storage timeout monitor | map | `` | no | +| storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `90` | no | +| storage_timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `50` | no | +| streamanalytics_conversion_errors_silenced | Groups to mute for Stream Analytics conversion errors monitor | map | `` | no | +| streamanalytics_conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no | +| streamanalytics_conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no | +| streamanalytics_failed_function_requests_silenced | Groups to mute for Stream Analytics failed requests monitor | map | `` | no | +| streamanalytics_failed_function_requests_threshold_critical | Failed Function Request rate 
limit (critical threshold) | string | `10` | no | -| streamanalytics_function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | +| streamanalytics_failed_function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | +| streamanalytics_runtime_errors_silenced | Groups to mute for Stream Analytics runtime errors monitor | map | `` | no | | streamanalytics_runtime_errors_threshold_critical | Runtime errors limit (critical threshold) | string | `10` | no | | streamanalytics_runtime_errors_threshold_warning | Runtime errors limit (warning threshold) | string | `0` | no | +| streamanalytics_status_silenced | Groups to mute for Stream Analytics status monitor | map | `` | no | +| streamanalytics_su_utilization_silenced | Groups to mute for Stream Analytics utilization monitor | map | `` | no | | streamanalytics_su_utilization_threshold_critical | Streaming Unit utilization rate limit (critical threshold) | string | `80` | no | | streamanalytics_su_utilization_threshold_warning | Streaming Unit utilization rate limit (warning threshold) | string | `60` | no | diff --git a/cloud/azure/apimanagement/README.md b/cloud/azure/apimanagement/README.md index e59e81a..82de287 100644 --- a/cloud/azure/apimanagement/README.md +++ b/cloud/azure/apimanagement/README.md @@ -29,13 +29,22 @@ Inputs |------|-------------|:----:|:-----:|:-----:| | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no | +| failed_requests_silenced | Groups to mute for API Management failed requests monitor | map | `` | no | +| failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `90` | no | +| failed_requests_threshold_warning | Warning regarding acceptable percent of failed 
requests | string | `50` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when a Redis monitor is triggered | string | - | yes | -| other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no | -| successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no | -| unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no | +| other_requests_silenced | Groups to mute for API Management other requests monitor | map | `` | no | +| other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `90` | no | +| other_requests_threshold_warning | Warning regarding acceptable percent of other requests | string | `50` | no | +| status_silenced | Groups to mute for API Management status monitor | map | `` | no | +| successful_requests_silenced | Groups to mute for API Management successful requests monitor | map | `` | no | +| successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `10` | no | +| successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests | string | `30` | no | +| unauthorized_requests_silenced | Groups to mute for API Management unauthorized requests monitor | map | `` | no | +| unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `90` | no | +| unauthorized_requests_threshold_warning | Warning regarding acceptable percent of unauthorized requests | string | `50` | no | Related documentation --------------------- diff --git a/cloud/azure/apimanagement/inputs.tf b/cloud/azure/apimanagement/inputs.tf index 7d04b46..74273b2 100644 --- 
a/cloud/azure/apimanagement/inputs.tf +++ b/cloud/azure/apimanagement/inputs.tf @@ -25,22 +25,72 @@ variable "filter_tags_custom" { } # Azure API Management specific +variable "status_silenced" { + description = "Groups to mute for API Management status monitor" + type = "map" + default = {} +} + +variable "failed_requests_silenced" { + description = "Groups to mute for API Management failed requests monitor" + type = "map" + default = {} +} + variable "failed_requests_threshold_critical" { description = "Maximum acceptable percent of failed requests" - default = 5 + default = 90 +} + +variable "failed_requests_threshold_warning" { + description = "Warning regarding acceptable percent of failed requests" + default = 50 +} + +variable "other_requests_silenced" { + description = "Groups to mute for API Management other requests monitor" + type = "map" + default = {} } variable "other_requests_threshold_critical" { description = "Maximum acceptable percent of other requests" - default = 5 + default = 90 +} + +variable "other_requests_threshold_warning" { + description = "Warning regarding acceptable percent of other requests" + default = 50 +} + +variable "unauthorized_requests_silenced" { + description = "Groups to mute for API Management unauthorized requests monitor" + type = "map" + default = {} } variable "unauthorized_requests_threshold_critical" { description = "Maximum acceptable percent of unauthorized requests" - default = 5 + default = 90 +} + +variable "unauthorized_requests_threshold_warning" { + description = "Warning regarding acceptable percent of unauthorized requests" + default = 50 +} + +variable "successful_requests_silenced" { + description = "Groups to mute for API Management successful requests monitor" + type = "map" + default = {} } variable "successful_requests_threshold_critical" { description = "Minimum acceptable percent of successful requests" - default = 90 + default = 10 +} + +variable "successful_requests_threshold_warning" { + 
description = "Warning regarding acceptable percent of successful requests" + default = 30 } diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf index 394812b..5a17080 100644 --- a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -9,7 +9,7 @@ data "template_file" "filter" { } resource "datadog_monitor" "apimgt_status" { - name = "[${var.environment}] API Management status is not ok on {{name}}" + name = "[${var.environment}] API Management is down" message = "${var.message}" query = <` | no | +| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `90` | no | +| http_4xx_requests_threshold_warning | Warning regarding acceptable percent of 4xx errors | string | `50` | no | +| http_5xx_requests_silenced | Groups to mute for App Services 5xx requests monitor | map | `` | no | +| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `90` | no | +| http_5xx_requests_threshold_warning | Warning regarding acceptable percent of 5xx errors | string | `50` | no | +| http_successful_requests_silenced | Groups to mute for App Services successful requests monitor | map | `` | no | +| http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `10` | no | +| http_successful_requests_threshold_warning | Warning regarding acceptable percent of 2xx & 3xx requests | string | `30` | no | +| memory_usage_silenced | Groups to mute for App Services memory usage monitor | map | `` | no | +| memory_usage_threshold_critical | Alerting threshold in Mib | string | `1073741824` | no | +| memory_usage_threshold_warning | Warning threshold in MiB | string | `536870912` | no | | message | Message sent when a monitor is triggered | string | - | yes | -| response_time_threshold_critical | Alerting 
threshold in seconds | string | `0.8` | no | -| response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | +| response_time_silenced | Groups to mute for App Services response time monitor | map | `` | no | +| response_time_threshold_critical | Alerting threshold for response time in seconds | string | `10` | no | +| response_time_threshold_warning | Warning threshold for response time in seconds | string | `5` | no | Related documentation --------------------- diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index ab9ea74..8ed4216 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -22,72 +22,82 @@ variable "delay" { default = 600 } -################################### -### RESPONSE TIME VARIABLES ### -################################### +variable "response_time_silenced" { + description = "Groups to mute for App Services response time monitor" + type = "map" + default = {} +} variable "response_time_threshold_critical" { - default = 0.8 - description = "Alerting threshold in seconds" + default = 10 + description = "Alerting threshold for response time in seconds" } variable "response_time_threshold_warning" { - default = 0.4 - description = "Warning threshold in seconds" + default = 5 + description = "Warning threshold for response time in seconds" } -################################### -### MEMORY USAGE VARIABLES ### -################################### +variable "memory_usage_silenced" { + description = "Groups to mute for App Services memory usage monitor" + type = "map" + default = {} +} variable "memory_usage_threshold_critical" { - default = 52430000 + default = 1073741824 # 1Gb description = "Alerting threshold in Mib" } variable "memory_usage_threshold_warning" { - default = 33550000 + default = 536870912 # 512Mb description = "Warning threshold in MiB" } -################################# -### HTTP 5xx status pages ### 
-################################# - -variable "http_5xx_requests_threshold_critical" { - default = 20 - description = "Maximum critical acceptable percent of 5xx errors" +variable "http_4xx_requests_silenced" { + description = "Groups to mute for App Services 4xx requests monitor" + type = "map" + default = {} } -variable "http_5xx_requests_threshold_warning" { - default = 10 - description = "Maximum warning acceptable percent of 5xx errors" -} - -################################# -### HTTP 4xx status pages ### -################################# - variable "http_4xx_requests_threshold_critical" { - default = 30 + default = 90 description = "Maximum critical acceptable percent of 4xx errors" } variable "http_4xx_requests_threshold_warning" { - default = 15 - description = "Maximum warning acceptable percent of 4xx errors" + default = 50 + description = "Warning regarding acceptable percent of 4xx errors" } -################################# -### HTTP 2xx status pages ### -################################# +variable "http_5xx_requests_silenced" { + description = "Groups to mute for App Services 5xx requests monitor" + type = "map" + default = {} +} + +variable "http_5xx_requests_threshold_critical" { + default = 90 + description = "Maximum critical acceptable percent of 5xx errors" +} + +variable "http_5xx_requests_threshold_warning" { + default = 50 + description = "Warning regarding acceptable percent of 5xx errors" +} + +variable "http_successful_requests_silenced" { + description = "Groups to mute for App Services successful requests monitor" + type = "map" + default = {} +} variable "http_successful_requests_threshold_critical" { - default = 90 + default = 10 description = "Minimum critical acceptable percent of 2xx & 3xx requests" } variable "http_successful_requests_threshold_warning" { - default = 95 - description = "Minimum warning acceptable percent of 2xx & 3xx requests" + default = 30 + description = "Warning regarding acceptable percent of 2xx & 3xx 
requests" } diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 342ae5e..62a68a8 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -8,7 +8,7 @@ data "template_file" "filter" { # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { - name = "[${var.environment}] App Services response time of {{value}}s is to high on {{name}}" + name = "[${var.environment}] App Services response time too high {{comparator}} {{#is_alert}}{{threshold}}s{{/is_alert}}{{#is_warning}}{{warn_threshold}}s{{/is_warning}} ({{value}}s)" type = "metric alert" message = "${var.message}" @@ -26,9 +26,11 @@ resource "datadog_monitor" "appservices_response_time" { critical = "${var.response_time_threshold_critical}" } - notify_no_data = true # Will notify when no data is received + silenced = "${var.response_time_silenced}" + + notify_no_data = true # Will notify when no data is received renotify_interval = 0 - require_full_window = true + require_full_window = false timeout_h = 0 include_tags = true @@ -37,7 +39,7 @@ resource "datadog_monitor" "appservices_response_time" { # Monitoring App Services memory usage resource "datadog_monitor" "appservices_memory_usage_count" { - name = "[${var.environment}] App Services memory usage > ${ceil(var.memory_usage_threshold_critical/1000000)}MiB on {{name}}" + name = "[${var.environment}] App Services memory usage {{comparator}} {{#is_alert}}{{threshold}}{{/is_alert}}{{#is_warning}}{{warn_threshold}}{{/is_warning}} ({{value}})" type = "metric alert" message = "${var.message}" @@ -55,9 +57,11 @@ resource "datadog_monitor" "appservices_memory_usage_count" { critical = "${var.memory_usage_threshold_critical}" } - notify_no_data = true # Will notify when no data is received + silenced = "${var.memory_usage_silenced}" + + notify_no_data = true # Will notify when no data is received 
renotify_interval = 0 - require_full_window = true + require_full_window = false timeout_h = 0 include_tags = true @@ -66,7 +70,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { # Monitoring App Services 5xx errors percent resource "datadog_monitor" "appservices_http_5xx_errors_count" { - name = "[${var.environment}] App Services HTTP 5xx errors is {{value}}% above the limit on {{name}}" + name = "[${var.environment}] App Services HTTP 5xx errors too high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" type = "metric alert" message = "${var.message}" @@ -85,9 +89,11 @@ resource "datadog_monitor" "appservices_http_5xx_errors_count" { critical = "${var.http_5xx_requests_threshold_critical}" } + silenced = "${var.http_5xx_requests_silenced}" + notify_no_data = false # Will NOT notify when no data is received renotify_interval = 0 - require_full_window = true + require_full_window = false timeout_h = 1 include_tags = true @@ -96,7 +102,7 @@ resource "datadog_monitor" "appservices_http_5xx_errors_count" { # Monitoring App Services 4xx errors percent resource "datadog_monitor" "appservices_http_4xx_errors_count" { - name = "[${var.environment}] App Services HTTP 4xx errors is {{value}}% above the limit on {{name}}" + name = "[${var.environment}] App Services HTTP 4xx errors too high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" type = "metric alert" message = "${var.message}" @@ -115,9 +121,11 @@ resource "datadog_monitor" "appservices_http_4xx_errors_count" { critical = "${var.http_4xx_requests_threshold_critical}" } + silenced = "${var.http_4xx_requests_silenced}" + notify_no_data = false # Will NOT notify when no data is received renotify_interval = 0 - require_full_window = true + require_full_window = false timeout_h = 1 include_tags = true @@ -126,7 +134,7 @@ resource "datadog_monitor" 
"appservices_http_4xx_errors_count" { # Monitoring App Services HTTP 2xx & 3xx status pages percent resource "datadog_monitor" "appservices_http_success_status_rate" { - name = "[${var.environment}] App Services HTTP successful responses is {{value}}% below the limit on {{name}}" + name = "[${var.environment}] App Services HTTP successful responses too low {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" type = "metric alert" message = "${var.message}" @@ -146,9 +154,11 @@ resource "datadog_monitor" "appservices_http_success_status_rate" { critical = "${var.http_successful_requests_threshold_critical}" } + silenced = "${var.http_successful_requests_silenced}" + notify_no_data = false # Will NOT notify when no data is received renotify_interval = 0 - require_full_window = true + require_full_window = false timeout_h = 1 include_tags = true diff --git a/cloud/azure/eventhub/README.md b/cloud/azure/eventhub/README.md index b2573da..7bfc5f6 100644 --- a/cloud/azure/eventhub/README.md +++ b/cloud/azure/eventhub/README.md @@ -29,22 +29,16 @@ Inputs |------|-------------|:----:|:-----:|:-----:| | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no | -| errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no | -| failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no | -| failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no | +| errors_rate_silenced | Groups to mute for Event Hub errors monitor | map | `` | no | +| errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `90` | no | +|
errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `50` | no | +| failed_requests_rate_silenced | Groups to mute for Event Hub failed requests monitor | map | `` | no | +| failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `90` | no | +| failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `50` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when an alert is triggered | string | - | yes | - -Outputs -------- - -| Name | Description | -|------|-------------| -| errors_monitor_id | Id of the `errors` monitor | -| failed_requests_monitor_id | Id of the `failed requests` monitor | -| status_monitor_id | Id of the `status` monitor | +| status_silenced | Groups to mute for Event Hub status monitor | map | `` | no | Related documentation --------------------- diff --git a/cloud/azure/eventhub/inputs.tf b/cloud/azure/eventhub/inputs.tf index 5cf007a..3d60a29 100644 --- a/cloud/azure/eventhub/inputs.tf +++ b/cloud/azure/eventhub/inputs.tf @@ -24,22 +24,40 @@ variable "filter_tags_custom" { default = "*" } +variable "status_silenced" { + description = "Groups to mute for Event Hub status monitor" + type = "map" + default = {} +} + +variable "failed_requests_rate_silenced" { + description = "Groups to mute for Event Hub failed requests monitor" + type = "map" + default = {} +} + variable "failed_requests_rate_thresold_critical" { description = "Failed requests ratio (percentage) to trigger the critical alert" - default = 3 + default = 90 } variable "failed_requests_rate_thresold_warning" { description = "Failed requests ratio (percentage) to trigger a warning alert" - default = 1 + default = 50 +} + +variable 
"errors_rate_silenced" { + description = "Groups to mute for Event Hub errors monitor" + type = "map" + default = {} } variable "errors_rate_thresold_critical" { description = "Errors ratio (percentage) to trigger the critical alert" - default = 3 + default = 90 } variable "errors_rate_thresold_warning" { description = "Errors ratio (percentage) to trigger a warning alert" - default = 1 + default = 50 } diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index b4914e2..0d93b95 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -7,7 +7,7 @@ data "template_file" "filter" { } resource "datadog_monitor" "eventhub_status" { - name = "[${var.environment}] Event Hub status is not ok on {{name}}" + name = "[${var.environment}] Event Hub is down" message = "${var.message}" query = <` | no | | dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no | | dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no | | environment | Architecture Environment | string | - | yes | -| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | -| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | -| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | -| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | -| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | -| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | -| 
failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | -| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | -| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | -| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | -| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | -| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | -| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | -| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | -| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | -| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| failed_c2d_methods_rate_silenced | Groups to mute for IoT Hub failed c2d methods monitor | map | `` | no | +| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `90` | no | +| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `50` | no | +| failed_c2d_twin_read_rate_silenced | Groups to mute for IoT Hub failed c2d twin read monitor | map | `` | no | +| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `90` | no | +| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `50` | no | +| failed_c2d_twin_update_rate_silenced | Groups to mute for IoT Hub failed c2d twin update monitor | map 
| `` | no | +| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `90` | no | +| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `50` | no | +| failed_d2c_twin_read_rate_silenced | Groups to mute for IoT Hub failed d2c twin read monitor | map | `` | no | +| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `90` | no | +| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `50` | no | +| failed_d2c_twin_update_rate_silenced | Groups to mute for IoT Hub failed d2c twin update monitor | map | `` | no | +| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `90` | no | +| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `50` | no | +| failed_jobs_rate_silenced | Groups to mute for IoT Hub failed jobs monitor | map | `` | no | +| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `90` | no | +| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `50` | no | +| failed_listjobs_rate_silenced | Groups to mute for IoT Hub failed list jobs monitor | map | `` | no | +| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `90` | no | +| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `50` | no | +| failed_queryjobs_rate_silenced | Groups to mute for IoT Hub failed query jobs monitor | map | `` | no | +| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `90` | no | +| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` | no | +| 
fallback_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub fallback d2c telemetry monitor | map | `` | no | | fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | | fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | -| filter_tags | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags | Tags used for filtering | string | `*` | no | +| invalid_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub invalid d2c telemetry monitor | map | `` | no | | invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | | invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | | message | Message sent when an alert is triggered | string | - | yes | +| orphaned_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub orphaned d2c telemetry monitor | map | `` | no | | orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no | | orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no | +| status_silenced | Groups to mute for IoT Hub status monitor | map | `` | no | +| too_many_d2c_telemetry_ingress_nosent_silenced | Groups to mute for IoT Hub unsent d2c telemetry monitor | map | `` | no | +| total_devices_silenced | Groups to mute for IoT Hub total device monitor | map | `` | no | Related documentation --------------------- diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 1eb0d0d..2eaaefc 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -20,84 +20,156 @@ variable "filter_tags" { } # Azure IOT hubs specific +variable "status_silenced" { + description = 
"Groups to mute for IoT Hub status monitor" + type = "map" + default = {} +} + +variable "total_devices_silenced" { + description = "Groups to mute for IoT Hub total device monitor" + type = "map" + default = {} +} + +variable "too_many_d2c_telemetry_ingress_nosent_silenced" { + description = "Groups to mute for IoT Hub unsent d2c telemetry monitor" + type = "map" + default = {} +} + +variable "failed_jobs_rate_silenced" { + description = "Groups to mute for IoT Hub failed jobs monitor" + type = "map" + default = {} +} + variable "failed_jobs_rate_threshold_warning" { description = "Jobs Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "failed_jobs_rate_threshold_critical" { description = "Jobs Failed rate limit (critical threshold)" - default = 10 + default = 90 +} + +variable "failed_listjobs_rate_silenced" { + description = "Groups to mute for IoT Hub failed list jobs monitor" + type = "map" + default = {} } variable "failed_listjobs_rate_threshold_warning" { description = "ListJobs Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "failed_listjobs_rate_threshold_critical" { description = "ListJobs Failed rate limit (critical threshold)" - default = 10 + default = 90 +} + +variable "failed_queryjobs_rate_silenced" { + description = "Groups to mute for IoT Hub failed query jobs monitor" + type = "map" + default = {} } variable "failed_queryjobs_rate_threshold_warning" { description = "QueryJobs Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "failed_queryjobs_rate_threshold_critical" { description = "QueryJobs Failed rate limit (critical threshold)" - default = 10 + default = 90 +} + +variable "failed_c2d_methods_rate_silenced" { + description = "Groups to mute for IoT Hub failed c2d methods monitor" + type = "map" + default = {} } variable "failed_c2d_methods_rate_threshold_warning" { description = "C2D Methods Failed rate limit (warning threshold)" - default = 0 + default 
= 50 } variable "failed_c2d_methods_rate_threshold_critical" { description = "C2D Methods Failed rate limit (critical threshold)" - default = 10 + default = 90 +} + +variable "failed_c2d_twin_read_rate_silenced" { + description = "Groups to mute for IoT Hub failed c2d twin read monitor" + type = "map" + default = {} } variable "failed_c2d_twin_read_rate_threshold_warning" { description = "C2D Twin Read Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "failed_c2d_twin_read_rate_threshold_critical" { description = "C2D Twin Read Failed rate limit (critical threshold)" - default = 10 + default = 90 +} + +variable "failed_c2d_twin_update_rate_silenced" { + description = "Groups to mute for IoT Hub failed c2d twin update monitor" + type = "map" + default = {} } variable "failed_c2d_twin_update_rate_threshold_warning" { description = "C2D Twin Update Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "failed_c2d_twin_update_rate_threshold_critical" { description = "C2D Twin Update Failed rate limit (critical threshold)" - default = 10 + default = 90 +} + +variable "failed_d2c_twin_read_rate_silenced" { + description = "Groups to mute for IoT Hub failed d2c twin read monitor" + type = "map" + default = {} } variable "failed_d2c_twin_read_rate_threshold_warning" { description = "D2C Twin Read Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "failed_d2c_twin_read_rate_threshold_critical" { description = "D2C Twin Read Failed rate limit (critical threshold)" - default = 10 + default = 90 +} + +variable "failed_d2c_twin_update_rate_silenced" { + description = "Groups to mute for IoT Hub failed d2c twin update monitor" + type = "map" + default = {} } variable "failed_d2c_twin_update_rate_threshold_warning" { description = "D2C Twin Update Failed rate limit (warning threshold)" - default = 0 + default = 50 } variable "failed_d2c_twin_update_rate_threshold_critical" { description = "D2C Twin 
Update Failed rate limit (critical threshold)" - default = 10 + default = 90 +} + +variable "dropped_d2c_telemetry_egress_silenced" { + description = "Groups to mute for IoT Hub dropped d2c telemetry monitor" + type = "map" + default = {} } variable "dropped_d2c_telemetry_egress_threshold_warning" { @@ -110,6 +182,12 @@ variable "dropped_d2c_telemetry_egress_threshold_critical" { default = 1000 } +variable "orphaned_d2c_telemetry_egress_silenced" { + description = "Groups to mute for IoT Hub orphaned d2c telemetry monitor" + type = "map" + default = {} +} + variable "orphaned_d2c_telemetry_egress_threshold_warning" { description = "D2C Telemetry Orphaned limit (warning threshold)" default = 500 @@ -120,6 +198,12 @@ variable "orphaned_d2c_telemetry_egress_threshold_critical" { default = 1000 } +variable "invalid_d2c_telemetry_egress_silenced" { + description = "Groups to mute for IoT Hub invalid d2c telemetry monitor" + type = "map" + default = {} +} + variable "invalid_d2c_telemetry_egress_threshold_warning" { description = "D2C Telemetry Invalid limit (warning threshold)" default = 500 @@ -130,6 +214,12 @@ variable "invalid_d2c_telemetry_egress_threshold_critical" { default = 1000 } +variable "fallback_d2c_telemetry_egress_silenced" { + description = "Groups to mute for IoT Hub fallback d2c telemetry monitor" + type = "map" + default = {} +} + variable "fallback_d2c_telemetry_egress_threshold_warning" { description = "D2C Telemetry Fallback limit (warning threshold)" default = 500 diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index e1e4e63..35cc12b 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,5 +1,5 @@ resource "datadog_monitor" "too_many_jobs_failed" { - name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" + name = "[${var.environment}] IOT Hub Too many jobs failed {{comparator}} 
{{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" message = "${var.message}" query = <` | no | | evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no | | evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when a Redis monitor is triggered | string | - | yes | +| percent_processor_time_silenced | Groups to mute for Redis processor monitor | map | `` | no | | percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no | | percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no | +| server_load_rate_silenced | Groups to mute for Redis server load monitor | map | `` | no | | server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | | server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no | diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf index 18f0448..79a8592 100644 --- a/cloud/azure/redis/inputs.tf +++ b/cloud/azure/redis/inputs.tf @@ -25,6 +25,18 @@ variable "filter_tags_custom" { } # Azure Redis specific +variable "status_silenced" { + description = "Groups to mute for Redis status monitor" + type = "map" + default = {} +} + +variable "evictedkeys_limit_silenced" { + description = "Groups to mute for Redis evicted keys monitor" + type = "map" + default = {} +} + variable "evictedkeys_limit_threshold_warning" { description = "Evicted keys limit (warning threshold)" default = 0 @@ -35,6 +47,12 @@ variable "evictedkeys_limit_threshold_critical" { default = 100 } +variable 
"percent_processor_time_silenced" { + description = "Groups to mute for Redis processor monitor" + type = "map" + default = {} +} + variable "percent_processor_time_threshold_critical" { description = "Processor time percent (critical threshold)" default = 80 @@ -45,6 +63,12 @@ variable "percent_processor_time_threshold_warning" { default = 60 } +variable "server_load_rate_silenced" { + description = "Groups to mute for Redis server load monitor" + type = "map" + default = {} +} + variable "server_load_rate_threshold_critical" { description = "Server CPU load rate (critical threshold)" default = 90 diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index 8e68558..7b9ad62 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -16,6 +16,8 @@ EOF type = "metric alert" + silenced = "${var.status_silenced}" + notify_no_data = true evaluation_delay = "${var.delay}" renotify_interval = 0 @@ -23,7 +25,7 @@ EOF timeout_h = 0 include_tags = true locked = false - require_full_window = true + require_full_window = false new_host_delay = "${var.delay}" no_data_timeframe = 20 @@ -31,7 +33,7 @@ EOF } resource "datadog_monitor" "evictedkeys" { - name = "[${var.environment}] Redis {{value}} evictedkeys on {{name}}" + name = "[${var.environment}] Redis too many evictedkeys {{comparator}} {{#is_alert}}{{threshold}}{{/is_alert}}{{#is_warning}}{{warn_threshold}}{{/is_warning}} ({{value}})" message = "${var.message}" query = <` | no | | cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | | cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no | +| deadlock_silenced | Groups to mute for SQL Deadlock monitor | map | `` | no | | deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | +| diskspace_silenced | 
Groups to mute for SQL disk space monitor | map | `` | no | | diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no | | diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no | +| dtu_silenced | Groups to mute for SQL DTU monitor | map | `` | no | | dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no | | dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no | | environment | Architecture Environment | string | - | yes | diff --git a/cloud/azure/sql-database/inputs.tf b/cloud/azure/sql-database/inputs.tf index aa81cfb..58be3d4 100644 --- a/cloud/azure/sql-database/inputs.tf +++ b/cloud/azure/sql-database/inputs.tf @@ -25,6 +25,11 @@ variable "filter_tags_custom" { } # Azure SQL Database specific +variable "cpu_silenced" { + description = "Groups to mute for SQL CPU monitor" + type = "map" + default = {} +} variable "cpu_threshold_warning" { description = "CPU usage in percent (warning threshold)" @@ -36,6 +41,12 @@ variable "cpu_threshold_critical" { default = "90" } +variable "diskspace_silenced" { + description = "Groups to mute for SQL disk space monitor" + type = "map" + default = {} +} + variable "diskspace_threshold_warning" { description = "Disk space used in percent (warning threshold)" default = "80" @@ -46,6 +57,12 @@ variable "diskspace_threshold_critical" { default = "90" } +variable "dtu_silenced" { + description = "Groups to mute for SQL DTU monitor" + type = "map" + default = {} +} + variable "dtu_threshold_warning" { description = "Amount of DTU used (warning threshold)" default = "85" @@ -56,6 +73,12 @@ variable "dtu_threshold_critical" { default = "90" } +variable "deadlock_silenced" { + description = "Groups to mute for SQL Deadlock monitor" + type = "map" + default = {} +} + variable "deadlock_threshold_critical" { description = "Amount of Deadlocks (critical threshold)" default = "1" diff --git 
a/cloud/azure/sql-database/monitors-sql-database-basics.tf b/cloud/azure/sql-database/monitors-sql-database-basics.tf index b013605..6df1cd3 100644 --- a/cloud/azure/sql-database/monitors-sql-database-basics.tf +++ b/cloud/azure/sql-database/monitors-sql-database-basics.tf @@ -7,7 +7,7 @@ data "template_file" "filter" { } resource "datadog_monitor" "sql-database_cpu_90_15min" { - name = "[${var.environment}] SQL Database CPU high > ${var.cpu_threshold_critical}% on {{name}}" + name = "[${var.environment}] SQL Database CPU too high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" message = "${var.message}" query = <` | no | +| authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `90` | no | +| authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `50` | no | +| availability_silenced | Groups to mute for Storage availability monitor | map | `` | no | +| availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `50` | no | +| availability_threshold_warning | Warning regarding acceptable percent of availability for a storage | string | `90` | no | +| client_other_error_requests_silenced | Groups to mute for Storage other errors monitor | map | `` | no | +| client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `90` | no | +| client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `50` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | 
`*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | -| latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `1000` | no | +| latency_silenced | Groups to mute for Storage latency monitor | map | `` | no | +| latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `2000` | no | +| latency_threshold_warning | Warning regarding acceptable end to end latency (ms) for a storage | string | `1000` | no | | message | Message sent when a Storage monitor is triggered | string | - | yes | -| network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `5` | no | -| server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `10` | no | -| successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `90` | no | -| throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `10` | no | -| timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `5` | no | +| network_error_requests_silenced | Groups to mute for Storage network errors monitor | map | `` | no | +| network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `90` | no | +| network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `50` | no | +| server_other_error_requests_silenced | Groups to mute for Storage server other errors monitor | map | `` | no | +| server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `90` | no | +|
server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `50` | no | +| successful_requests_silenced | Groups to mute for Storage successful requests monitor | map | `` | no | +| successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `10` | no | +| successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `30` | no | +| throttling_error_requests_silenced | Groups to mute for Storage throttling error monitor | map | `` | no | +| throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `90` | no | +| throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `50` | no | +| timeout_error_requests_silenced | Groups to mute for Storage timeout monitor | map | `` | no | +| timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `90` | no | +| timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `50` | no | Related documentation --------------------- diff --git a/cloud/azure/storage/inputs.tf b/cloud/azure/storage/inputs.tf index e48df74..f665a6e 100644 --- a/cloud/azure/storage/inputs.tf +++ b/cloud/azure/storage/inputs.tf @@ -25,47 +25,146 @@ variable "filter_tags_custom" { } # Azure Storage specific +variable "availability_silenced" { + description = "Groups to mute for Storage availability monitor" + type = "map" + default = {} +} + variable "availability_threshold_critical" { description = "Minimum acceptable percent of availability for a storage" + default = 50 +} + +variable "availability_threshold_warning" { + description = "Warning regarding acceptable percent of
availability for a storage" default = 90 } +variable "successful_requests_silenced" { + description = "Groups to mute for Storage sucessful requests monitor" + type = "map" + default = {} +} + variable "successful_requests_threshold_critical" { description = "Minimum acceptable percent of successful requests for a storage" - default = 90 + default = 10 +} + +variable "successful_requests_threshold_warning" { + description = "Warning regarding acceptable percent of successful requests for a storage" + default = 30 +} + +variable "latency_silenced" { + description = "Groups to mute for Storage latency monitor" + type = "map" + default = {} } variable "latency_threshold_critical" { description = "Maximum acceptable end to end latency (ms) for a storage" + default = 2000 +} + +variable "latency_threshold_warning" { + description = "Warning regarding acceptable end to end latency (ms) for a storage" default = 1000 } +variable "timeout_error_requests_silenced" { + description = "Groups to mute for Storage timeout monitor" + type = "map" + default = {} +} + variable "timeout_error_requests_threshold_critical" { description = "Maximum acceptable percent of timeout error requests for a storage" - default = 5 + default = 90 +} + +variable "timeout_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of timeout error requests for a storage" + default = 50 +} + +variable "network_error_requests_silenced" { + description = "Groups to mute for Storage network errors monitor" + type = "map" + default = {} } variable "network_error_requests_threshold_critical" { description = "Maximum acceptable percent of network error requests for a storage" - default = 5 + default = 90 +} + +variable "network_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of network error requests for a storage" + default = 50 +} + +variable "throttling_error_requests_silenced" { + description = "Groups to mute for Storage 
throttling error monitor" + type = "map" + default = {} } variable "throttling_error_requests_threshold_critical" { description = "Maximum acceptable percent of throttling error requests for a storage" - default = 10 + default = 90 +} + +variable "throttling_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of throttling error requests for a storage" + default = 50 +} + +variable "server_other_error_requests_silenced" { + description = "Groups to mute for Storage server other errors monitor" + type = "map" + default = {} } variable "server_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of server other error requests for a storage" - default = 10 + default = 90 +} + +variable "server_other_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of server other error requests for a storage" + default = 50 +} + +variable "client_other_error_requests_silenced" { + description = "Groups to mute for Storage other errors monitor" + type = "map" + default = {} } variable "client_other_error_requests_threshold_critical" { description = "Maximum acceptable percent of client other error requests for a storage" - default = 15 + default = 90 +} + +variable "client_other_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of client other error requests for a storage" + default = 50 +} + +variable "authorization_error_requests_silenced" { + description = "Groups to mute for Storage authorization errors monitor" + type = "map" + default = {} } variable "authorization_error_requests_threshold_critical" { description = "Maximum acceptable percent of authorization error requests for a storage" - default = 15 + default = 90 +} + +variable "authorization_error_requests_threshold_warning" { + description = "Warning regarding acceptable percent of authorization error requests for a storage" + default = 50 } diff --git 
a/cloud/azure/storage/monitors-azure-storage.tf b/cloud/azure/storage/monitors-azure-storage.tf index 0e5137c..ed25396 100644 --- a/cloud/azure/storage/monitors-azure-storage.tf +++ b/cloud/azure/storage/monitors-azure-storage.tf @@ -7,7 +7,7 @@ data "template_file" "filter" { } resource "datadog_monitor" "availability" { - name = "[${var.environment}] Azure Storage {{name}} unavailability detected" + name = "[${var.environment}] Azure Storage is down" message = "${var.message}" query = <` | no | | conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no | | conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | +| failed_function_requests_silenced | Groups to mute for Stream Analytics failed requests monitor | map | `` | no | | failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | +| failed_function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | -| function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | | message | Message sent when a Redis monitor is triggered | string | - | yes | +| runtime_errors_silenced | Groups to mute for Stream Analytics runtime errors monitor | map | `` | no | | runtime_errors_threshold_critical | Runtime errors limit (critical threshold) | string | `10` | no | | runtime_errors_threshold_warning | Runtime errors limit (warning threshold) | string | `0` | no | +| status_silenced | Groups to mute for Stream Analytics status 
monitor | map | `` | no | +| su_utilization_silenced | Groups to mute for Stream Analytics utilization monitor | map | `` | no | | su_utilization_threshold_critical | Streaming Unit utilization rate limit (critical threshold) | string | `80` | no | | su_utilization_threshold_warning | Streaming Unit utilization rate limit (warning threshold) | string | `60` | no | diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index ce3c713..d0e86fc 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -25,6 +25,18 @@ variable "filter_tags_custom" { } # Azure Stream Analytics specific +variable "status_silenced" { + description = "Groups to mute for Stream Analytics status monitor" + type = "map" + default = {} +} + +variable "su_utilization_silenced" { + description = "Groups to mute for Stream Analytics utilization monitor" + type = "map" + default = {} +} + variable "su_utilization_threshold_warning" { description = "Streaming Unit utilization rate limit (warning threshold)" default = 60 @@ -35,7 +47,13 @@ variable "su_utilization_threshold_critical" { default = 80 } -variable "function_requests_threshold_warning" { +variable "failed_function_requests_silenced" { + description = "Groups to mute for Stream Analytics failed requests monitor" + type = "map" + default = {} +} + +variable "failed_function_requests_threshold_warning" { description = "Failed Function Request rate limit (warning threshold)" default = 0 } @@ -45,6 +63,12 @@ variable "failed_function_requests_threshold_critical" { default = 10 } +variable "conversion_errors_silenced" { + description = "Groups to mute for Stream Analytics conversion errors monitor" + type = "map" + default = {} +} + variable "conversion_errors_threshold_warning" { description = "Conversion errors limit (warning threshold)" default = 0 @@ -55,6 +79,12 @@ variable "conversion_errors_threshold_critical" { default = 10 } +variable 
"runtime_errors_silenced" { + description = "Groups to mute for Stream Analytics runtime errors monitor" + type = "map" + default = {} +} + variable "runtime_errors_threshold_warning" { description = "Runtime errors limit (warning threshold)" default = 0 diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 1931eb2..71068c2 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -7,7 +7,7 @@ data "template_file" "filter" { } resource "datadog_monitor" "status" { - name = "[${var.environment}] Stream Analytics Status is not ok on {{name}}" + name = "[${var.environment}] Stream Analytics is down" message = "${var.message}" query = <