Merge branch 'MON-455_kubernetes_state.container.status_report.count.terminated' into 'master'

MON-455 add terminated container monitor

Closes MON-455

See merge request claranet/pt-monitoring/projects/datadog/terraform/monitors!76
This commit is contained in:
Quentin Manfroi 2019-07-15 14:47:12 +02:00
commit ee412ee29c
4 changed files with 84 additions and 0 deletions

View File

@ -17,6 +17,7 @@ module "datadog-monitors-caas-kubernetes-pod" {
Creates DataDog monitors with the following checks: Creates DataDog monitors with the following checks:
- Kubernetes Pod phase status failed - Kubernetes Pod phase status failed
- Kubernetes Pod terminated abnormally
- Kubernetes Pod waiting errors - Kubernetes Pod waiting errors
## Inputs ## Inputs
@ -43,6 +44,13 @@ Creates DataDog monitors with the following checks:
| pod\_phase\_status\_time\_aggregator | Monitor aggregator for Pod phase status [available values: min, max or avg] | string | `"max"` | no | | pod\_phase\_status\_time\_aggregator | Monitor aggregator for Pod phase status [available values: min, max or avg] | string | `"max"` | no |
| pod\_phase\_status\_timeframe | Monitor timeframe for Pod phase status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | pod\_phase\_status\_timeframe | Monitor timeframe for Pod phase status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no | | prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
| terminated\_enabled | Flag to enable Pod terminated monitor | string | `"true"` | no |
| terminated\_extra\_tags | Extra tags for Pod terminated monitor | list(string) | `[]` | no |
| terminated\_message | Custom message for Pod terminated monitor | string | `""` | no |
| terminated\_threshold\_critical | terminated critical threshold | string | `"0.5"` | no |
| terminated\_threshold\_warning | terminated warning threshold | string | `"0"` | no |
| terminated\_time\_aggregator | Monitor aggregator for Pod terminated [available values: min, max, sum or avg] | string | `"sum"` | no |
| terminated\_timeframe | Monitor timeframe for Pod terminated [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_10m"` | no |
## Outputs ## Outputs
@ -50,6 +58,7 @@ Creates DataDog monitors with the following checks:
|------|-------------| |------|-------------|
| error\_id | id for monitor error | | error\_id | id for monitor error |
| pod\_phase\_status\_id | id for monitor pod_phase_status | | pod\_phase\_status\_id | id for monitor pod_phase_status |
| terminated\_id | id for monitor terminated |
## Related documentation ## Related documentation

View File

@ -110,3 +110,43 @@ variable "error_threshold_warning" {
description = "error warning threshold" description = "error warning threshold"
} }
# On/off switch for the Pod terminated monitor.
# Declared as a string (not a bool); the default is the string "true".
variable "terminated_enabled" {
  type        = string
  default     = "true"
  description = "Flag to enable Pod terminated monitor"
}
# Additional tags for the Pod terminated monitor; empty list by default.
variable "terminated_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Pod terminated monitor"
}
# Optional dedicated notification message for the Pod terminated monitor.
# Defaults to the empty string.
variable "terminated_message" {
  type        = string
  default     = ""
  description = "Custom message for Pod terminated monitor"
}
# Time aggregator for the Pod terminated monitor.
# The default is "sum", so "sum" is listed among the documented values
# (the previous description only mentioned min, max and avg).
variable "terminated_time_aggregator" {
  description = "Monitor aggregator for Pod terminated [available values: min, max, sum or avg]"
  type        = string
  default     = "sum"
}
# Evaluation window for the Pod terminated monitor; defaults to "last_10m".
variable "terminated_timeframe" {
  type        = string
  default     = "last_10m"
  description = "Monitor timeframe for Pod terminated [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# Critical threshold for the Pod terminated monitor (no explicit type declared).
variable "terminated_threshold_critical" {
  description = "terminated critical threshold"
  default     = 0.5
}
# Warning threshold for the Pod terminated monitor (no explicit type declared).
variable "terminated_threshold_warning" {
  description = "terminated warning threshold"
  default     = 0
}

View File

@ -57,3 +57,33 @@ EOQ
tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform"], var.error_extra_tags) tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform"], var.error_extra_tags)
} }
# Datadog query alert on kubernetes_state.container.status_report.count.terminated,
# grouped by namespace, pod and reason.
# The monitor is created only when var.terminated_enabled equals the string "true".
resource "datadog_monitor" "terminated" {
  count = var.terminated_enabled == "true" ? 1 : 0

  type = "query alert"
  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Pod terminated abnormally"

  # coalesce() picks var.terminated_message unless it is null/empty, then var.message.
  message = coalesce(var.terminated_message, var.message)

  # Plain heredoc (not <<-): the query lines stay at column 0 so the
  # rendered string is unchanged. The critical threshold is repeated in the
  # query and in the thresholds map below.
  query = <<EOQ
${var.terminated_time_aggregator}(${var.terminated_timeframe}):
sum:kubernetes_state.container.status_report.count.terminated${module.filter-tags-nocontainercreating.query_alert} by {namespace,pod,reason}
> ${var.terminated_threshold_critical}
EOQ

  thresholds = {
    warning  = var.terminated_threshold_warning
    critical = var.terminated_threshold_critical
  }

  # Delays taken from the module-wide variables.
  new_host_delay   = var.new_host_delay
  evaluation_delay = var.evaluation_delay

  # Fixed notification / evaluation settings.
  require_full_window = true
  include_tags        = true
  locked              = false
  notify_audit        = false
  notify_no_data      = false
  renotify_interval   = 0
  timeout_h           = 0

  # Fixed tag set followed by the caller-supplied extra tags.
  tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform"], var.terminated_extra_tags)
}

View File

@ -8,3 +8,8 @@ output "error_id" {
value = datadog_monitor.error.*.id value = datadog_monitor.error.*.id
} }
# IDs of the Pod terminated monitor instances (the resource uses count,
# so the value is a list).
output "terminated_id" {
  value       = datadog_monitor.terminated[*].id
  description = "id for monitor terminated"
}