MON-455 add terminated container monitor
This commit is contained in:
parent
026c32ee94
commit
2a4bf39569
@ -17,6 +17,7 @@ module "datadog-monitors-caas-kubernetes-pod" {
|
||||
Creates DataDog monitors with the following checks:
|
||||
|
||||
- Kubernetes Pod phase status failed
|
||||
- Kubernetes Pod terminated abnormally
|
||||
- Kubernetes Pod waiting errors
|
||||
|
||||
## Inputs
|
||||
@ -43,6 +44,13 @@ Creates DataDog monitors with the following checks:
|
||||
| pod\_phase\_status\_time\_aggregator | Monitor aggregator for Pod phase status [available values: min, max or avg] | string | `"max"` | no |
|
||||
| pod\_phase\_status\_timeframe | Monitor timeframe for Pod phase status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
|
||||
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
|
||||
| terminated\_enabled | Flag to enable Pod terminated monitor | string | `"true"` | no |
|
||||
| terminated\_extra\_tags | Extra tags for Pod terminated monitor | list(string) | `[]` | no |
|
||||
| terminated\_message | Custom message for Pod terminated monitor | string | `""` | no |
|
||||
| terminated\_threshold\_critical | terminated critical threshold | string | `"0.5"` | no |
|
||||
| terminated\_threshold\_warning | terminated warning threshold | string | `"0"` | no |
|
||||
| terminated\_time\_aggregator | Monitor aggregator for Pod terminated [available values: min, max, avg or sum] | string | `"sum"` | no |
|
||||
| terminated\_timeframe | Monitor timeframe for Pod terminated [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_10m"` | no |
|
||||
|
||||
## Outputs
|
||||
|
||||
@ -50,6 +58,7 @@ Creates DataDog monitors with the following checks:
|
||||
|------|-------------|
|
||||
| error\_id | id for monitor error |
|
||||
| pod\_phase\_status\_id | id for monitor pod_phase_status |
|
||||
| terminated\_id | id for monitor terminated |
|
||||
|
||||
## Related documentation
|
||||
|
||||
|
||||
@ -110,3 +110,43 @@ variable "error_threshold_warning" {
|
||||
description = "error warning threshold"
|
||||
}
|
||||
|
||||
# Switch for the Pod terminated monitor. The monitor resource compares this
# value against the exact string "true" to decide whether it is created.
variable "terminated_enabled" {
  type        = string
  default     = "true"
  description = "Flag to enable Pod terminated monitor"
}
|
||||
|
||||
# Additional tags appended to the fixed tag list of the Pod terminated monitor.
variable "terminated_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Pod terminated monitor"
}
|
||||
|
||||
# Monitor-specific notification message. The monitor passes this through
# coalesce() together with var.message, so the empty default defers to
# the module-wide message.
variable "terminated_message" {
  type        = string
  default     = ""
  description = "Custom message for Pod terminated monitor"
}
|
||||
|
||||
# Time aggregation function placed at the start of the Pod terminated monitor
# query. The description lists "sum" as well, because it is the default value
# used by this module.
variable "terminated_time_aggregator" {
  description = "Monitor aggregator for Pod terminated [available values: min, max, avg or sum]"
  type        = string
  default     = "sum"
}
|
||||
|
||||
# Evaluation window interpolated into the Pod terminated monitor query,
# right after the time aggregator.
variable "terminated_timeframe" {
  type        = string
  default     = "last_10m"
  description = "Monitor timeframe for Pod terminated [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
|
||||
|
||||
# Critical threshold for the Pod terminated monitor. It is used both inside
# the query comparison and in the monitor's thresholds map.
variable "terminated_threshold_critical" {
  description = "terminated critical threshold"
  default     = 0.5
}
|
||||
|
||||
# Warning threshold for the Pod terminated monitor (thresholds map only).
variable "terminated_threshold_warning" {
  description = "terminated warning threshold"
  default     = 0
}
|
||||
|
||||
|
||||
@ -57,3 +57,33 @@ EOQ
|
||||
tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform"], var.error_extra_tags)
|
||||
}
|
||||
|
||||
# Query-alert monitor on the
# kubernetes_state.container.status_report.count.terminated metric,
# grouped by namespace, pod and reason.
resource "datadog_monitor" "terminated" {
  # Created only when the flag is exactly the string "true".
  count = var.terminated_enabled == "true" ? 1 : 0

  # Optional "[prefix]" followed by "[environment]" and the monitor title.
  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Pod terminated abnormally"

  # coalesce() skips empty strings, so an unset terminated_message falls back
  # to the module-wide message.
  message = coalesce(var.terminated_message, var.message)
  type    = "query alert"

  # <time aggregator>(<timeframe>): <metric + tag filter> by {...} > <critical>
  query = <<EOQ
${var.terminated_time_aggregator}(${var.terminated_timeframe}):
sum:kubernetes_state.container.status_report.count.terminated${module.filter-tags-nocontainercreating.query_alert} by {namespace,pod,reason}
> ${var.terminated_threshold_critical}
EOQ

  # The critical value must match the one compared against in the query above.
  thresholds = {
    critical = var.terminated_threshold_critical
    warning  = var.terminated_threshold_warning
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_no_data      = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true

  # Fixed module tags followed by any caller-supplied extra tags.
  tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform"], var.terminated_extra_tags)
}
|
||||
|
||||
|
||||
@ -8,3 +8,8 @@ output "error_id" {
|
||||
value = datadog_monitor.error.*.id
|
||||
}
|
||||
|
||||
# IDs of the Pod terminated monitor instances. The resource uses count, so
# this is a list (empty when the monitor is disabled).
output "terminated_id" {
  value       = datadog_monitor.terminated.*.id
  description = "id for monitor terminated"
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user