MON-73 Raise all thresholds involving erroneous application behaviors to only handle possible infrastructure failures

This commit is contained in:
Laurent Piroelle 2018-02-13 15:34:31 +01:00 committed by Quentin Manfroi
parent 97192755c4
commit fed0d592e9
14 changed files with 247 additions and 191 deletions

View File

@ -31,46 +31,50 @@ Inputs
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| apimanagement_failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no |
| apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no |
| apimanagement_successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no |
| apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no |
| appservices_http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no |
| appservices_http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no |
| appservices_http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no |
| appservices_http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no |
| appservices_http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `90` | no |
| appservices_http_successful_requests_threshold_warning | Minimum warning acceptable percent of 2xx & 3xx requests | string | `95` | no |
| appservices_memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no |
| appservices_memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no |
| appservices_response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no |
| appservices_response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no |
| apimanagement_failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `90` | no |
| apimanagement_failed_requests_threshold_warning | Warning regarding acceptable percent of failed requests | string | `50` | no |
| apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `90` | no |
| apimanagement_other_requests_threshold_warning | Warning regarding acceptable percent of other requests | string | `50` | no |
| apimanagement_successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `10` | no |
| apimanagement_successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests | string | `30` | no |
| apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `90` | no |
| apimanagement_unauthorized_requests_threshold_warning | Warning regarding acceptable percent of unauthorized requests | string | `50` | no |
| appservices_http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `90` | no |
| appservices_http_4xx_requests_threshold_warning | Warning regarding acceptable percent of 4xx errors | string | `50` | no |
| appservices_http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `90` | no |
| appservices_http_5xx_requests_threshold_warning | Warning regarding acceptable percent of 5xx errors | string | `50` | no |
| appservices_http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `10` | no |
| appservices_http_successful_requests_threshold_warning | Warning regarding acceptable percent of 2xx & 3xx requests | string | `30` | no |
| appservices_memory_usage_threshold_critical | Alerting threshold in bytes | string | `1073741824` | no |
| appservices_memory_usage_threshold_warning | Warning threshold in bytes | string | `536870912` | no |
| appservices_response_time_threshold_critical | Alerting threshold in seconds | string | `10` | no |
| appservices_response_time_threshold_warning | Warning threshold in seconds | string | `5` | no |
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
| environment | Architecture environment | string | - | yes |
| eventhub_errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no |
| eventhub_errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no |
| eventhub_failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no |
| eventhub_failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no |
| eventhub_errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `90` | no |
| eventhub_errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `50` | no |
| eventhub_failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `90` | no |
| eventhub_failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `50` | no |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| iothub_dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no |
| iothub_dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no |
| iothub_failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no |
| iothub_failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no |
| iothub_failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no |
| iothub_failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no |
| iothub_failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no |
| iothub_failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no |
| iothub_failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no |
| iothub_failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no |
| iothub_failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no |
| iothub_failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no |
| iothub_failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no |
| iothub_failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no |
| iothub_failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no |
| iothub_failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no |
| iothub_failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no |
| iothub_failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no |
| iothub_failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `50` | no |
| iothub_failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `50` | no |
| iothub_failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `50` | no |
| iothub_failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `50` | no |
| iothub_failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `50` | no |
| iothub_failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `50` | no |
| iothub_failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `50` | no |
| iothub_failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` | no |
| iothub_fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no |
| iothub_fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no |
| iothub_invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no |
@ -92,24 +96,24 @@ Inputs
| sqldatabase_diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no |
| sqldatabase_dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no |
| sqldatabase_dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no |
| storage_authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `50` | no |
| storage_authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `15` | no |
| storage_authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `90` | no |
| storage_authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `50` | no |
| storage_availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `50` | no |
| storage_availability_threshold_warning | Warning regarding acceptable percent of availability for a storage | string | `90` | no |
| storage_client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `50` | no |
| storage_client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `15` | no |
| storage_client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `90` | no |
| storage_client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `50` | no |
| storage_latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `2000` | no |
| storage_latency_threshold_warning | Warning regarding acceptable end to end latency (ms) for a storage | string | `1000` | no |
| storage_network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `50` | no |
| storage_network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `5` | no |
| storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `50` | no |
| storage_server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `10` | no |
| storage_network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `90` | no |
| storage_network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `50` | no |
| storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `90` | no |
| storage_server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `50` | no |
| storage_successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `10` | no |
| storage_successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `30` | no |
| storage_throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `50` | no |
| storage_throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `10` | no |
| storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `50` | no |
| storage_timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `5` | no |
| storage_throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `90` | no |
| storage_throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `50` | no |
| storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `90` | no |
| storage_timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `50` | no |
| streamanalytics_conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no |
| streamanalytics_conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no |
| streamanalytics_failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no |

View File

@ -29,13 +29,17 @@ Inputs
|------|-------------|:----:|:-----:|:-----:|
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
| environment | Architecture environment | string | - | yes |
| failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no |
| failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `90` | no |
| failed_requests_threshold_warning | Warning regarding acceptable percent of failed requests | string | `50` | no |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| message | Message sent when an API Management monitor is triggered | string | - | yes |
| other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no |
| successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no |
| unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no |
| other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `90` | no |
| other_requests_threshold_warning | Warning regarding acceptable percent of other requests | string | `50` | no |
| successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `10` | no |
| successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests | string | `30` | no |
| unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `90` | no |
| unauthorized_requests_threshold_warning | Warning regarding acceptable percent of unauthorized requests | string | `50` | no |
Related documentation
---------------------

View File

@ -27,20 +27,40 @@ variable "filter_tags_custom" {
# Azure API Management specific
variable "failed_requests_threshold_critical" {
description = "Maximum acceptable percent of failed requests"
default = 5
default = 90
}
variable "failed_requests_threshold_warning" {
description = "Warning regarding acceptable percent of failed requests"
default = 50
}
variable "other_requests_threshold_critical" {
description = "Maximum acceptable percent of other requests"
default = 5
default = 90
}
variable "other_requests_threshold_warning" {
description = "Warning regarding acceptable percent of other requests"
default = 50
}
variable "unauthorized_requests_threshold_critical" {
description = "Maximum acceptable percent of unauthorized requests"
default = 5
default = 90
}
variable "unauthorized_requests_threshold_warning" {
description = "Warning regarding acceptable percent of unauthorized requests"
default = 50
}
variable "successful_requests_threshold_critical" {
description = "Minimum acceptable percent of successful requests"
default = 90
default = 10
}
variable "successful_requests_threshold_warning" {
description = "Warning regarding acceptable percent of successful requests"
default = 30
}

View File

@ -49,6 +49,7 @@ resource "datadog_monitor" "apimgt_failed_requests" {
thresholds {
critical = "${var.failed_requests_threshold_critical}"
warning = "${var.failed_requests_threshold_warning}"
}
type = "metric alert"
@ -79,6 +80,7 @@ resource "datadog_monitor" "apimgt_other_requests" {
thresholds {
critical = "${var.other_requests_threshold_critical}"
warning = "${var.other_requests_threshold_warning}"
}
type = "metric alert"
@ -109,6 +111,7 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" {
thresholds {
critical = "${var.unauthorized_requests_threshold_critical}"
warning = "${var.unauthorized_requests_threshold_warning}"
}
type = "metric alert"
@ -139,6 +142,7 @@ resource "datadog_monitor" "apimgt_successful_requests" {
thresholds {
critical = "${var.successful_requests_threshold_critical}"
warning = "${var.successful_requests_threshold_warning}"
}
type = "metric alert"

View File

@ -32,17 +32,17 @@ Inputs
| environment | Architecture environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `90` | no |
| http_successful_requests_threshold_warning | Minimum warning acceptable percent of 2xx & 3xx requests | string | `95` | no |
| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no |
| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no |
| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no |
| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no |
| memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no |
| memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no |
| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `90` | no |
| http_4xx_requests_threshold_warning | Warning regarding acceptable percent of 4xx errors | string | `50` | no |
| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `90` | no |
| http_5xx_requests_threshold_warning | Warning regarding acceptable percent of 5xx errors | string | `50` | no |
| http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `10` | no |
| http_successful_requests_threshold_warning | Warning regarding acceptable percent of 2xx & 3xx requests | string | `30` | no |
| memory_usage_threshold_critical | Alerting threshold in bytes | string | `1073741824` | no |
| memory_usage_threshold_warning | Warning threshold in bytes | string | `536870912` | no |
| message | Message sent when a monitor is triggered | string | - | yes |
| response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no |
| response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no |
| response_time_threshold_critical | Alerting threshold in seconds | string | `10` | no |
| response_time_threshold_warning | Warning threshold in seconds | string | `5` | no |
Related documentation
---------------------

View File

@ -27,12 +27,12 @@ variable "delay" {
###################################
variable "response_time_threshold_critical" {
default = 0.8
default = 10
description = "Alerting threshold in seconds"
}
variable "response_time_threshold_warning" {
default = 0.4
default = 5
description = "Warning threshold in seconds"
}
@ -41,12 +41,12 @@ variable "response_time_threshold_warning" {
###################################
variable "memory_usage_threshold_critical" {
default = 52430000
default = 1073741824 # 1 GiB, expressed in bytes
description = "Alerting threshold in Mib"
}
variable "memory_usage_threshold_warning" {
default = 33550000
default = 536870912 # 512 MiB, expressed in bytes
description = "Warning threshold in MiB"
}
@ -55,13 +55,13 @@ variable "memory_usage_threshold_warning" {
#################################
variable "http_5xx_requests_threshold_critical" {
default = 20
default = 90
description = "Maximum critical acceptable percent of 5xx errors"
}
variable "http_5xx_requests_threshold_warning" {
default = 10
description = "Maximum warning acceptable percent of 5xx errors"
default = 50
description = "Warning regarding acceptable percent of 5xx errors"
}
#################################
@ -69,13 +69,13 @@ variable "http_5xx_requests_threshold_warning" {
#################################
variable "http_4xx_requests_threshold_critical" {
default = 30
default = 90
description = "Maximum critical acceptable percent of 4xx errors"
}
variable "http_4xx_requests_threshold_warning" {
default = 15
description = "Maximum warning acceptable percent of 4xx errors"
default = 50
description = "Warning regarding acceptable percent of 4xx errors"
}
#################################
@ -83,11 +83,11 @@ variable "http_4xx_requests_threshold_warning" {
#################################
variable "http_successful_requests_threshold_critical" {
default = 90
default = 10
description = "Minimum critical acceptable percent of 2xx & 3xx requests"
}
variable "http_successful_requests_threshold_warning" {
default = 95
description = "Minimum warning acceptable percent of 2xx & 3xx requests"
default = 30
description = "Warning regarding acceptable percent of 2xx & 3xx requests"
}

View File

@ -29,10 +29,10 @@ Inputs
|------|-------------|:----:|:-----:|:-----:|
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
| environment | Architecture environment | string | - | yes |
| errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no |
| errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no |
| failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no |
| failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no |
| errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `90` | no |
| errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `50` | no |
| failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `90` | no |
| failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `50` | no |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| message | Message sent when an alert is triggered | string | - | yes |

View File

@ -26,20 +26,20 @@ variable "filter_tags_custom" {
variable "failed_requests_rate_thresold_critical" {
description = "Failed requests ratio (percentage) to trigger the critical alert"
default = 3
default = 90
}
variable "failed_requests_rate_thresold_warning" {
description = "Failed requests ratio (percentage) to trigger a warning alert"
default = 1
default = 50
}
variable "errors_rate_thresold_critical" {
description = "Errors ratio (percentage) to trigger the critical alert"
default = 3
default = 90
}
variable "errors_rate_thresold_warning" {
description = "Errors ratio (percentage) to trigger a warning alert"
default = 1
default = 50
}

View File

@ -31,175 +31,195 @@ variable "non_taggable_filter_tags" {
# Azure API Management specific variables
variable "apimanagement_failed_requests_threshold_critical" {
description = "Maximum acceptable percent of failed requests"
default = 5
default = 90
}
variable "apimanagement_failed_requests_threshold_warning" {
description = "Warning regarding acceptable percent of failed requests"
default = 50
}
variable "apimanagement_other_requests_threshold_critical" {
description = "Maximum acceptable percent of other requests"
default = 5
default = 90
}
variable "apimanagement_other_requests_threshold_warning" {
description = "Warning regarding acceptable percent of other requests"
default = 50
}
variable "apimanagement_unauthorized_requests_threshold_critical" {
description = "Maximum acceptable percent of unauthorized requests"
default = 5
default = 90
}
variable "apimanagement_unauthorized_requests_threshold_warning" {
description = "Warning regarding acceptable percent of unauthorized requests"
default = 50
}
variable "apimanagement_successful_requests_threshold_critical" {
description = "Minimum acceptable percent of successful requests"
default = 90
default = 10
}
variable "apimanagement_successful_requests_threshold_warning" {
description = "Warning regarding acceptable percent of successful requests"
default = 30
}
# Azure App Services specific variables
variable "appservices_response_time_threshold_critical" {
default = 0.8
default = 10
description = "Alerting threshold in seconds"
}
variable "appservices_response_time_threshold_warning" {
default = 0.4
default = 5
description = "Warning threshold in seconds"
}
variable "appservices_memory_usage_threshold_critical" {
default = 52430000
default = 1073741824 # 1 GiB, expressed in bytes
description = "Alerting threshold in Mib"
}
variable "appservices_memory_usage_threshold_warning" {
default = 33550000
default = 536870912 # 512 MiB, expressed in bytes
description = "Warning threshold in MiB"
}
variable "appservices_http_4xx_requests_threshold_critical" {
default = 30
default = 90
description = "Maximum critical acceptable percent of 4xx errors"
}
variable "appservices_http_4xx_requests_threshold_warning" {
default = 15
description = "Maximum warning acceptable percent of 4xx errors"
default = 50
description = "Warning regarding acceptable percent of 4xx errors"
}
variable "appservices_http_5xx_requests_threshold_critical" {
default = 20
default = 90
description = "Maximum critical acceptable percent of 5xx errors"
}
variable "appservices_http_5xx_requests_threshold_warning" {
default = 10
description = "Maximum warning acceptable percent of 5xx errors"
default = 50
description = "Warning regarding acceptable percent of 5xx errors"
}
variable "appservices_http_successful_requests_threshold_critical" {
default = 90
default = 10
description = "Minimum critical acceptable percent of 2xx & 3xx requests"
}
variable "appservices_http_successful_requests_threshold_warning" {
default = 95
description = "Minimum warning acceptable percent of 2xx & 3xx requests"
default = 30
description = "Warning regarding acceptable percent of 2xx & 3xx requests"
}
# Azure Event Hub specific variables
variable "eventhub_failed_requests_rate_thresold_critical" {
description = "Failed requests ratio (percentage) to trigger the critical alert"
default = 3
default = 90
}
variable "eventhub_failed_requests_rate_thresold_warning" {
description = "Failed requests ratio (percentage) to trigger a warning alert"
default = 1
default = 50
}
variable "eventhub_errors_rate_thresold_critical" {
description = "Errors ratio (percentage) to trigger the critical alert"
default = 3
default = 90
}
variable "eventhub_errors_rate_thresold_warning" {
description = "Errors ratio (percentage) to trigger a warning alert"
default = 1
default = 50
}
# IOT Hub specific variables
variable "iothub_failed_jobs_rate_threshold_warning" {
description = "Jobs Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "iothub_failed_jobs_rate_threshold_critical" {
description = "Jobs Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "iothub_failed_listjobs_rate_threshold_warning" {
description = "ListJobs Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "iothub_failed_listjobs_rate_threshold_critical" {
description = "ListJobs Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "iothub_failed_queryjobs_rate_threshold_warning" {
description = "QueryJobs Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "iothub_failed_queryjobs_rate_threshold_critical" {
description = "QueryJobs Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "iothub_failed_c2d_methods_rate_threshold_warning" {
description = "C2D Methods Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "iothub_failed_c2d_methods_rate_threshold_critical" {
description = "C2D Methods Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "iothub_failed_c2d_twin_read_rate_threshold_warning" {
description = "C2D Twin Read Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "iothub_failed_c2d_twin_read_rate_threshold_critical" {
description = "C2D Twin Read Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "iothub_failed_c2d_twin_update_rate_threshold_warning" {
description = "C2D Twin Update Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "iothub_failed_c2d_twin_update_rate_threshold_critical" {
description = "C2D Twin Update Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "iothub_failed_d2c_twin_read_rate_threshold_warning" {
description = "D2C Twin Read Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "iothub_failed_d2c_twin_read_rate_threshold_critical" {
description = "D2C Twin Read Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "iothub_failed_d2c_twin_update_rate_threshold_warning" {
description = "D2C Twin Update Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "iothub_failed_d2c_twin_update_rate_threshold_critical" {
description = "D2C Twin Update Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "iothub_dropped_d2c_telemetry_egress_threshold_warning" {
@@ -342,62 +362,62 @@ variable "storage_latency_threshold_warning" {
variable "storage_timeout_error_requests_threshold_critical" {
description = "Maximum acceptable percent of timeout error requests for a storage"
default = 50
default = 90
}
variable "storage_timeout_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of timeout error requests for a storage"
default = 5
default = 50
}
variable "storage_network_error_requests_threshold_critical" {
description = "Maximum acceptable percent of network error requests for a storage"
default = 50
default = 90
}
variable "storage_network_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of network error requests for a storage"
default = 5
default = 50
}
variable "storage_throttling_error_requests_threshold_critical" {
description = "Maximum acceptable percent of throttling error requests for a storage"
default = 50
default = 90
}
variable "storage_throttling_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of throttling error requests for a storage"
default = 10
default = 50
}
variable "storage_server_other_error_requests_threshold_critical" {
description = "Maximum acceptable percent of server other error requests for a storage"
default = 50
default = 90
}
variable "storage_server_other_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of server other error requests for a storage"
default = 10
default = 50
}
variable "storage_client_other_error_requests_threshold_critical" {
description = "Maximum acceptable percent of client other error requests for a storage"
default = 50
default = 90
}
variable "storage_client_other_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of client other error requests for a storage"
default = 15
default = 50
}
variable "storage_authorization_error_requests_threshold_critical" {
description = "Maximum acceptable percent of authorization error requests for a storage"
default = 50
default = 90
}
variable "storage_authorization_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of authorization error requests for a storage"
default = 15
default = 50
}
# Azure Stream Analytics specific variables

View File

@@ -42,25 +42,25 @@ Inputs
| dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no |
| dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no |
| environment | Architecture Environment | string | - | yes |
| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no |
| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no |
| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no |
| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no |
| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no |
| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no |
| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no |
| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no |
| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no |
| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no |
| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no |
| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no |
| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no |
| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no |
| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no |
| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no |
| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `90` | no |
| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `50` | no |
| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `90` | no |
| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `50` | no |
| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `90` | no |
| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `50` | no |
| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `90` | no |
| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `50` | no |
| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `90` | no |
| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `50` | no |
| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `90` | no |
| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `50` | no |
| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `90` | no |
| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `50` | no |
| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `90` | no |
| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` | no |
| fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no |
| fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no |
| filter_tags | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags | Tags used for filtering | string | `*` | no |
| invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no |
| invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no |
| message | Message sent when an alert is triggered | string | - | yes |

View File

@@ -22,82 +22,82 @@ variable "filter_tags" {
# Azure IOT hubs specific
variable "failed_jobs_rate_threshold_warning" {
description = "Jobs Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "failed_jobs_rate_threshold_critical" {
description = "Jobs Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "failed_listjobs_rate_threshold_warning" {
description = "ListJobs Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "failed_listjobs_rate_threshold_critical" {
description = "ListJobs Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "failed_queryjobs_rate_threshold_warning" {
description = "QueryJobs Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "failed_queryjobs_rate_threshold_critical" {
description = "QueryJobs Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "failed_c2d_methods_rate_threshold_warning" {
description = "C2D Methods Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "failed_c2d_methods_rate_threshold_critical" {
description = "C2D Methods Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "failed_c2d_twin_read_rate_threshold_warning" {
description = "C2D Twin Read Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "failed_c2d_twin_read_rate_threshold_critical" {
description = "C2D Twin Read Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "failed_c2d_twin_update_rate_threshold_warning" {
description = "C2D Twin Update Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "failed_c2d_twin_update_rate_threshold_critical" {
description = "C2D Twin Update Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "failed_d2c_twin_read_rate_threshold_warning" {
description = "D2C Twin Read Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "failed_d2c_twin_read_rate_threshold_critical" {
description = "D2C Twin Read Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "failed_d2c_twin_update_rate_threshold_warning" {
description = "D2C Twin Update Failed rate limit (warning threshold)"
default = 0
default = 50
}
variable "failed_d2c_twin_update_rate_threshold_critical" {
description = "D2C Twin Update Failed rate limit (critical threshold)"
default = 10
default = 90
}
variable "dropped_d2c_telemetry_egress_threshold_warning" {

View File

@@ -9,9 +9,13 @@ module "apimanagement" {
filter_tags_custom = "${var.filter_tags_custom}"
failed_requests_threshold_critical = "${var.apimanagement_failed_requests_threshold_critical}"
failed_requests_threshold_warning = "${var.apimanagement_failed_requests_threshold_warning}"
other_requests_threshold_critical = "${var.apimanagement_other_requests_threshold_critical}"
other_requests_threshold_warning = "${var.apimanagement_other_requests_threshold_warning}"
successful_requests_threshold_critical = "${var.apimanagement_successful_requests_threshold_critical}"
successful_requests_threshold_warning = "${var.apimanagement_successful_requests_threshold_warning}"
unauthorized_requests_threshold_critical = "${var.apimanagement_unauthorized_requests_threshold_critical}"
unauthorized_requests_threshold_warning = "${var.apimanagement_unauthorized_requests_threshold_warning}"
}
module "appservices" {

View File

@@ -32,12 +32,12 @@ Inputs
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `50` | no |
| authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `15` | no |
| authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `90` | no |
| authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `50` | no |
| availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `50` | no |
| availability_threshold_warning | Warning regarding acceptable percent of availability for a storage | string | `90` | no |
| client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `50` | no |
| client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `15` | no |
| client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `90` | no |
| client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `50` | no |
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
| environment | Architecture environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
@@ -45,16 +45,16 @@ Inputs
| latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `2000` | no |
| latency_threshold_warning | Warning regarding acceptable end to end latency (ms) for a storage | string | `1000` | no |
| message | Message sent when a storage monitor is triggered | string | - | yes |
| network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `50` | no |
| network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `5` | no |
| server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `50` | no |
| server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `10` | no |
| network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `90` | no |
| network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `50` | no |
| server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `90` | no |
| server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `50` | no |
| successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `10` | no |
| successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `30` | no |
| throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `50` | no |
| throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `10` | no |
| timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `50` | no |
| timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `5` | no |
| throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `90` | no |
| throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `50` | no |
| timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `90` | no |
| timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `50` | no |
Related documentation
---------------------

View File

@@ -57,60 +57,60 @@ variable "latency_threshold_warning" {
variable "timeout_error_requests_threshold_critical" {
description = "Maximum acceptable percent of timeout error requests for a storage"
default = 50
default = 90
}
variable "timeout_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of timeout error requests for a storage"
default = 5
default = 50
}
variable "network_error_requests_threshold_critical" {
description = "Maximum acceptable percent of network error requests for a storage"
default = 50
default = 90
}
variable "network_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of network error requests for a storage"
default = 5
default = 50
}
variable "throttling_error_requests_threshold_critical" {
description = "Maximum acceptable percent of throttling error requests for a storage"
default = 50
default = 90
}
variable "throttling_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of throttling error requests for a storage"
default = 10
default = 50
}
variable "server_other_error_requests_threshold_critical" {
description = "Maximum acceptable percent of server other error requests for a storage"
default = 50
default = 90
}
variable "server_other_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of server other error requests for a storage"
default = 10
default = 50
}
variable "client_other_error_requests_threshold_critical" {
description = "Maximum acceptable percent of client other error requests for a storage"
default = 50
default = 90
}
variable "client_other_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of client other error requests for a storage"
default = 15
default = 50
}
variable "authorization_error_requests_threshold_critical" {
description = "Maximum acceptable percent of authorization error requests for a storage"
default = 50
default = 90
}
variable "authorization_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of authorization error requests for a storage"
default = 15
default = 50
}