MON-79 Raise critical thresholds and add warning thresholds to avoid flapping ("bagot") alerts during NBH

This commit is contained in:
Laurent Piroelle 2018-02-08 17:54:51 +01:00 committed by Quentin Manfroi
parent 1223e3b26f
commit a0ac2d7629
4 changed files with 108 additions and 0 deletions

View File

@@ -312,46 +312,91 @@ variable "sqldatabase_deadlock_threshold_critical" {
# Azure Storage specific variables # Azure Storage specific variables
variable "storage_availability_threshold_critical" { variable "storage_availability_threshold_critical" {
description = "Minimum acceptable percent of availability for a storage" description = "Minimum acceptable percent of availability for a storage"
default = 50
}
variable "storage_availability_threshold_warning" {
description = "Warning regarding acceptable percent of availability for a storage"
default = 90 default = 90
} }
variable "storage_successful_requests_threshold_critical" { variable "storage_successful_requests_threshold_critical" {
description = "Minimum acceptable percent of successful requests for a storage" description = "Minimum acceptable percent of successful requests for a storage"
default = 50
}
variable "storage_successful_requests_threshold_warning" {
description = "Warning regarding acceptable percent of successful requests for a storage"
default = 90 default = 90
} }
variable "storage_latency_threshold_critical" { variable "storage_latency_threshold_critical" {
description = "Maximum acceptable end to end latency (ms) for a storage" description = "Maximum acceptable end to end latency (ms) for a storage"
default = 2000
}
variable "storage_latency_threshold_warning" {
description = "Warning regarding acceptable end to end latency (ms) for a storage"
default = 1000 default = 1000
} }
variable "storage_timeout_error_requests_threshold_critical" { variable "storage_timeout_error_requests_threshold_critical" {
description = "Maximum acceptable percent of timeout error requests for a storage" description = "Maximum acceptable percent of timeout error requests for a storage"
default = 50
}
variable "storage_timeout_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of timeout error requests for a storage"
default = 5 default = 5
} }
variable "storage_network_error_requests_threshold_critical" { variable "storage_network_error_requests_threshold_critical" {
description = "Maximum acceptable percent of network error requests for a storage" description = "Maximum acceptable percent of network error requests for a storage"
default = 50
}
variable "storage_network_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of network error requests for a storage"
default = 5 default = 5
} }
variable "storage_throttling_error_requests_threshold_critical" { variable "storage_throttling_error_requests_threshold_critical" {
description = "Maximum acceptable percent of throttling error requests for a storage" description = "Maximum acceptable percent of throttling error requests for a storage"
default = 50
}
variable "storage_throttling_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of throttling error requests for a storage"
default = 10 default = 10
} }
variable "storage_server_other_error_requests_threshold_critical" { variable "storage_server_other_error_requests_threshold_critical" {
description = "Maximum acceptable percent of server other error requests for a storage" description = "Maximum acceptable percent of server other error requests for a storage"
default = 50
}
variable "storage_server_other_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of server other error requests for a storage"
default = 10 default = 10
} }
variable "storage_client_other_error_requests_threshold_critical" { variable "storage_client_other_error_requests_threshold_critical" {
description = "Maximum acceptable percent of client other error requests for a storage" description = "Maximum acceptable percent of client other error requests for a storage"
default = 50
}
variable "storage_client_other_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of client other error requests for a storage"
default = 15 default = 15
} }
variable "storage_authorization_error_requests_threshold_critical" { variable "storage_authorization_error_requests_threshold_critical" {
description = "Maximum acceptable percent of authorization error requests for a storage" description = "Maximum acceptable percent of authorization error requests for a storage"
default = 50
}
variable "storage_authorization_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of authorization error requests for a storage"
default = 15 default = 15
} }

View File

@@ -135,14 +135,23 @@ module "storage" {
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = "${var.filter_tags_custom}"
authorization_error_requests_threshold_critical = "${var.storage_authorization_error_requests_threshold_critical}" authorization_error_requests_threshold_critical = "${var.storage_authorization_error_requests_threshold_critical}"
authorization_error_requests_threshold_warning = "${var.storage_authorization_error_requests_threshold_warning}"
availability_threshold_critical = "${var.storage_availability_threshold_critical}" availability_threshold_critical = "${var.storage_availability_threshold_critical}"
availability_threshold_warning = "${var.storage_availability_threshold_warning}"
client_other_error_requests_threshold_critical = "${var.storage_client_other_error_requests_threshold_critical}" client_other_error_requests_threshold_critical = "${var.storage_client_other_error_requests_threshold_critical}"
client_other_error_requests_threshold_warning = "${var.storage_client_other_error_requests_threshold_warning}"
latency_threshold_critical = "${var.storage_latency_threshold_critical}" latency_threshold_critical = "${var.storage_latency_threshold_critical}"
latency_threshold_warning = "${var.storage_latency_threshold_warning}"
network_error_requests_threshold_critical = "${var.storage_network_error_requests_threshold_critical}" network_error_requests_threshold_critical = "${var.storage_network_error_requests_threshold_critical}"
network_error_requests_threshold_warning = "${var.storage_network_error_requests_threshold_warning}"
server_other_error_requests_threshold_critical = "${var.storage_server_other_error_requests_threshold_critical}" server_other_error_requests_threshold_critical = "${var.storage_server_other_error_requests_threshold_critical}"
server_other_error_requests_threshold_warning = "${var.storage_server_other_error_requests_threshold_warning}"
successful_requests_threshold_critical = "${var.storage_successful_requests_threshold_critical}" successful_requests_threshold_critical = "${var.storage_successful_requests_threshold_critical}"
successful_requests_threshold_warning = "${var.storage_successful_requests_threshold_warning}"
throttling_error_requests_threshold_critical = "${var.storage_throttling_error_requests_threshold_critical}" throttling_error_requests_threshold_critical = "${var.storage_throttling_error_requests_threshold_critical}"
throttling_error_requests_threshold_warning = "${var.storage_throttling_error_requests_threshold_warning}"
timeout_error_requests_threshold_critical = "${var.storage_timeout_error_requests_threshold_critical}" timeout_error_requests_threshold_critical = "${var.storage_timeout_error_requests_threshold_critical}"
timeout_error_requests_threshold_warning = "${var.storage_timeout_error_requests_threshold_warning}"
} }
module "streamanalytics" { module "streamanalytics" {

View File

@@ -27,45 +27,90 @@ variable "filter_tags_custom" {
# Azure Storage specific # Azure Storage specific
variable "availability_threshold_critical" { variable "availability_threshold_critical" {
description = "Minimum acceptable percent of availability for a storage" description = "Minimum acceptable percent of availability for a storage"
default = 50
}
variable "availability_threshold_warning" {
description = "Warning regarding acceptable percent of availability for a storage"
default = 90 default = 90
} }
variable "successful_requests_threshold_critical" { variable "successful_requests_threshold_critical" {
description = "Minimum acceptable percent of successful requests for a storage" description = "Minimum acceptable percent of successful requests for a storage"
default = 50
}
variable "successful_requests_threshold_warning" {
description = "Warning regarding acceptable percent of successful requests for a storage"
default = 90 default = 90
} }
variable "latency_threshold_critical" { variable "latency_threshold_critical" {
description = "Maximum acceptable end to end latency (ms) for a storage" description = "Maximum acceptable end to end latency (ms) for a storage"
default = 2000
}
variable "latency_threshold_warning" {
description = "Warning regarding acceptable end to end latency (ms) for a storage"
default = 1000 default = 1000
} }
variable "timeout_error_requests_threshold_critical" { variable "timeout_error_requests_threshold_critical" {
description = "Maximum acceptable percent of timeout error requests for a storage" description = "Maximum acceptable percent of timeout error requests for a storage"
default = 50
}
variable "timeout_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of timeout error requests for a storage"
default = 5 default = 5
} }
variable "network_error_requests_threshold_critical" { variable "network_error_requests_threshold_critical" {
description = "Maximum acceptable percent of network error requests for a storage" description = "Maximum acceptable percent of network error requests for a storage"
default = 50
}
variable "network_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of network error requests for a storage"
default = 5 default = 5
} }
variable "throttling_error_requests_threshold_critical" { variable "throttling_error_requests_threshold_critical" {
description = "Maximum acceptable percent of throttling error requests for a storage" description = "Maximum acceptable percent of throttling error requests for a storage"
default = 50
}
variable "throttling_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of throttling error requests for a storage"
default = 10 default = 10
} }
variable "server_other_error_requests_threshold_critical" { variable "server_other_error_requests_threshold_critical" {
description = "Maximum acceptable percent of server other error requests for a storage" description = "Maximum acceptable percent of server other error requests for a storage"
default = 50
}
variable "server_other_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of server other error requests for a storage"
default = 10 default = 10
} }
variable "client_other_error_requests_threshold_critical" { variable "client_other_error_requests_threshold_critical" {
description = "Maximum acceptable percent of client other error requests for a storage" description = "Maximum acceptable percent of client other error requests for a storage"
default = 50
}
variable "client_other_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of client other error requests for a storage"
default = 15 default = 15
} }
variable "authorization_error_requests_threshold_critical" { variable "authorization_error_requests_threshold_critical" {
description = "Maximum acceptable percent of authorization error requests for a storage" description = "Maximum acceptable percent of authorization error requests for a storage"
default = 50
}
variable "authorization_error_requests_threshold_warning" {
description = "Warning regarding acceptable percent of authorization error requests for a storage"
default = 15 default = 15
} }

View File

@@ -18,6 +18,7 @@ EOF
thresholds { thresholds {
critical = "${var.availability_threshold_critical}" critical = "${var.availability_threshold_critical}"
warning = "${var.availability_threshold_warning}"
} }
type = "metric alert" type = "metric alert"
@@ -47,6 +48,7 @@ EOF
thresholds { thresholds {
critical = "${var.successful_requests_threshold_critical}" critical = "${var.successful_requests_threshold_critical}"
warning = "${var.successful_requests_threshold_warning}"
} }
type = "metric alert" type = "metric alert"
@@ -76,6 +78,7 @@ EOF
thresholds { thresholds {
critical = "${var.latency_threshold_critical}" critical = "${var.latency_threshold_critical}"
warning = "${var.latency_threshold_warning}"
} }
type = "metric alert" type = "metric alert"
@@ -105,6 +108,7 @@ EOF
thresholds { thresholds {
critical = "${var.timeout_error_requests_threshold_critical}" critical = "${var.timeout_error_requests_threshold_critical}"
warning = "${var.timeout_error_requests_threshold_warning}"
} }
type = "metric alert" type = "metric alert"
@@ -134,6 +138,7 @@ EOF
thresholds { thresholds {
critical = "${var.network_error_requests_threshold_critical}" critical = "${var.network_error_requests_threshold_critical}"
warning = "${var.network_error_requests_threshold_warning}"
} }
type = "metric alert" type = "metric alert"
@@ -163,6 +168,7 @@ EOF
thresholds { thresholds {
critical = "${var.throttling_error_requests_threshold_critical}" critical = "${var.throttling_error_requests_threshold_critical}"
warning = "${var.throttling_error_requests_threshold_warning}"
} }
type = "metric alert" type = "metric alert"
@@ -192,6 +198,7 @@ EOF
thresholds { thresholds {
critical = "${var.server_other_error_requests_threshold_critical}" critical = "${var.server_other_error_requests_threshold_critical}"
warning = "${var.server_other_error_requests_threshold_warning}"
} }
type = "metric alert" type = "metric alert"
@@ -221,6 +228,7 @@ EOF
thresholds { thresholds {
critical = "${var.client_other_error_requests_threshold_critical}" critical = "${var.client_other_error_requests_threshold_critical}"
warning = "${var.client_other_error_requests_threshold_warning}"
} }
type = "metric alert" type = "metric alert"
@@ -250,6 +258,7 @@ EOF
thresholds { thresholds {
critical = "${var.authorization_error_requests_threshold_critical}" critical = "${var.authorization_error_requests_threshold_critical}"
warning = "${var.authorization_error_requests_threshold_warning}"
} }
type = "metric alert" type = "metric alert"