From 2a4bf3956934d236eacf867ff0a21ee460913e7e Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Thu, 11 Jul 2019 18:45:51 +0200 Subject: [PATCH] MON-455 add terminated container monitor --- caas/kubernetes/pod/README.md | 9 ++++++ caas/kubernetes/pod/inputs.tf | 40 +++++++++++++++++++++++++ caas/kubernetes/pod/monitors-k8s-pod.tf | 30 +++++++++++++++++++ caas/kubernetes/pod/outputs.tf | 5 ++++ 4 files changed, 84 insertions(+) diff --git a/caas/kubernetes/pod/README.md b/caas/kubernetes/pod/README.md index 1dbae32..c6dd2b5 100644 --- a/caas/kubernetes/pod/README.md +++ b/caas/kubernetes/pod/README.md @@ -17,6 +17,7 @@ module "datadog-monitors-caas-kubernetes-pod" { Creates DataDog monitors with the following checks: - Kubernetes Pod phase status failed +- Kubernetes Pod terminated abnormally - Kubernetes Pod waiting errors ## Inputs @@ -43,6 +44,13 @@ Creates DataDog monitors with the following checks: | pod\_phase\_status\_time\_aggregator | Monitor aggregator for Pod phase status [available values: min, max or avg] | string | `"max"` | no | | pod\_phase\_status\_timeframe | Monitor timeframe for Pod phase status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no | +| terminated\_enabled | Flag to enable Pod terminated monitor | string | `"true"` | no | +| terminated\_extra\_tags | Extra tags for Pod terminated monitor | list(string) | `[]` | no | +| terminated\_message | Custom message for Pod terminated monitor | string | `""` | no | +| terminated\_threshold\_critical | terminated critical threshold | string | `"0.5"` | no | +| terminated\_threshold\_warning | terminated warning threshold | string | `"0"` | no | +| terminated\_time\_aggregator | Monitor aggregator for Pod terminated [available values: min, max, sum or avg] | string | `"sum"` | no | +| terminated\_timeframe | Monitor timeframe for 
Pod terminated [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_10m"` | no | ## Outputs @@ -50,6 +58,7 @@ Creates DataDog monitors with the following checks: |------|-------------| | error\_id | id for monitor error | | pod\_phase\_status\_id | id for monitor pod_phase_status | +| terminated\_id | id for monitor terminated | ## Related documentation diff --git a/caas/kubernetes/pod/inputs.tf b/caas/kubernetes/pod/inputs.tf index 73f74da..b78b617 100644 --- a/caas/kubernetes/pod/inputs.tf +++ b/caas/kubernetes/pod/inputs.tf @@ -110,3 +110,43 @@ variable "error_threshold_warning" { description = "error warning threshold" } +variable "terminated_enabled" { + description = "Flag to enable Pod terminated monitor" + type = string + default = "true" +} + +variable "terminated_extra_tags" { + description = "Extra tags for Pod terminated monitor" + type = list(string) + default = [] +} + +variable "terminated_message" { + description = "Custom message for Pod terminated monitor" + type = string + default = "" +} + +variable "terminated_time_aggregator" { + description = "Monitor aggregator for Pod terminated [available values: min, max, sum or avg]" + type = string + default = "sum" +} + +variable "terminated_timeframe" { + description = "Monitor timeframe for Pod terminated [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_10m" +} + +variable "terminated_threshold_critical" { + default = 0.5 + description = "terminated critical threshold" +} + +variable "terminated_threshold_warning" { + default = 0 + description = "terminated warning threshold" +} + diff --git a/caas/kubernetes/pod/monitors-k8s-pod.tf b/caas/kubernetes/pod/monitors-k8s-pod.tf index ab5154c..0d66011 100644 --- a/caas/kubernetes/pod/monitors-k8s-pod.tf +++ b/caas/kubernetes/pod/monitors-k8s-pod.tf @@ -57,3 +57,33 @@ EOQ tags = concat(["env:${var.environment}", "type:caas", 
"provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform"], var.error_extra_tags) } +resource "datadog_monitor" "terminated" { + count = var.terminated_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Pod terminated abnormally" + message = coalesce(var.terminated_message, var.message) + type = "query alert" + + query = < ${var.terminated_threshold_critical} +EOQ + + thresholds = { + critical = var.terminated_threshold_critical + warning = var.terminated_threshold_warning + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + notify_no_data = false + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + + tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform"], var.terminated_extra_tags) +} + diff --git a/caas/kubernetes/pod/outputs.tf b/caas/kubernetes/pod/outputs.tf index 70cf2f9..69f066e 100644 --- a/caas/kubernetes/pod/outputs.tf +++ b/caas/kubernetes/pod/outputs.tf @@ -8,3 +8,8 @@ output "error_id" { value = datadog_monitor.error.*.id } +output "terminated_id" { + description = "id for monitor terminated" + value = datadog_monitor.terminated.*.id +} +