From ed57bb36109f5818a3ae5b2c922c85920ce8a4c2 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Tue, 7 May 2019 17:33:53 +0200 Subject: [PATCH] MON-444 mutualise pod error and crashloopbackoff --- caas/kubernetes/pod/README.md | 16 ++------ caas/kubernetes/pod/inputs.tf | 50 +------------------------ caas/kubernetes/pod/modules.tf | 22 ----------- caas/kubernetes/pod/monitors-k8s-pod.tf | 34 +---------------- caas/kubernetes/pod/outputs.tf | 5 --- 5 files changed, 6 insertions(+), 121 deletions(-) diff --git a/caas/kubernetes/pod/README.md b/caas/kubernetes/pod/README.md index 5930583..50d0946 100644 --- a/caas/kubernetes/pod/README.md +++ b/caas/kubernetes/pod/README.md @@ -16,29 +16,20 @@ module "datadog-monitors-caas-kubernetes-pod" { Creates DataDog monitors with the following checks: -- Kubernetes Pod CrashLoopBackOff -- Kubernetes Pod errors - Kubernetes Pod phase status failed +- Kubernetes Pod waiting errors ## Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| crashloopbackoff\_enabled | Flag to enable Pod crashloopbackoff monitor | string | `"true"` | no | -| crashloopbackoff\_extra\_tags | Extra tags for Pod crashloopbackoff monitor | list | `[]` | no | -| crashloopbackoff\_message | Custom message for Pod crashloopbackoff monitor | string | `""` | no | -| crashloopbackoff\_silenced | Groups to mute for Pod crashloopbackoff monitor | map | `{}` | no | -| crashloopbackoff\_threshold\_critical | crashloopbackoff critical threshold | string | `"5"` | no | -| crashloopbackoff\_threshold\_warning | crashloopbackoff warning threshold | string | `"4"` | no | -| crashloopbackoff\_time\_aggregator | Monitor aggregator for Pod crashloopbackoff [available values: min, max or avg] | string | `"sum"` | no | -| crashloopbackoff\_timeframe | Monitor timeframe for Pod crashloopbackoff [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_10m"` | no | | environment | Architecture environment | string | n/a | yes | | error\_enabled | Flag to enable Pod errors monitor | string | `"true"` | no | | error\_extra\_tags | Extra tags for Pod errors monitor | list | `[]` | no | | error\_message | Custom message for Pod errors monitor | string | `""` | no | | error\_silenced | Groups to mute for Pod errors monitor | map | `{}` | no | -| error\_threshold\_critical | error critical threshold | string | `"1"` | no | -| error\_threshold\_warning | error warning threshold | string | `"0.5"` | no | +| error\_threshold\_critical | error critical threshold | string | `"0.5"` | no | +| error\_threshold\_warning | error warning threshold | string | `"0"` | no | | error\_time\_aggregator | Monitor aggregator for Pod errors [available values: min, max or avg] | string | `"sum"` | no | | error\_timeframe | Monitor timeframe for Pod errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no | | evaluation\_delay | Delay in seconds for the metric evaluation | string | `"15"` | no | @@ -59,7 +50,6 @@ Creates DataDog monitors with the following checks: | Name | Description | |------|-------------| -| crashloopbackoff\_id | id for monitor crashloopbackoff | | error\_id | id for monitor error | | pod\_phase\_status\_id | id for monitor pod_phase_status | diff --git a/caas/kubernetes/pod/inputs.tf b/caas/kubernetes/pod/inputs.tf index a0e6d0c..0c9ef8e 100644 --- a/caas/kubernetes/pod/inputs.tf +++ b/caas/kubernetes/pod/inputs.tf @@ -113,57 +113,11 @@ variable "error_timeframe" { } variable "error_threshold_critical" { - default = 1 + default = 0.5 description = "error critical threshold" } variable "error_threshold_warning" { - default = 0.5 + default = 0 description = "error warning threshold" } - -variable "crashloopbackoff_silenced" { - description = "Groups to mute for Pod crashloopbackoff monitor" - type = "map" - default = {} -} - -variable "crashloopbackoff_enabled" { - description = "Flag to enable Pod crashloopbackoff monitor" - type = "string" - default = "true" -} - -variable "crashloopbackoff_extra_tags" { - description = "Extra tags for Pod crashloopbackoff monitor" - type = "list" - default = [] -} - -variable "crashloopbackoff_message" { - description = "Custom message for Pod crashloopbackoff monitor" - type = "string" - default = "" -} - -variable "crashloopbackoff_time_aggregator" { - description = "Monitor aggregator for Pod crashloopbackoff [available values: min, max or avg]" - type = "string" - default = "sum" -} - -variable "crashloopbackoff_timeframe" { - description = "Monitor timeframe for Pod crashloopbackoff [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" - type = "string" - default = "last_10m" -} - -variable "crashloopbackoff_threshold_critical" { - default = 5 - description = "crashloopbackoff critical threshold" -} - -variable "crashloopbackoff_threshold_warning" { - default = 4 - description = "crashloopbackoff warning threshold" -} diff --git a/caas/kubernetes/pod/modules.tf b/caas/kubernetes/pod/modules.tf index 120fb5b..07c1c59 100644 --- a/caas/kubernetes/pod/modules.tf +++ b/caas/kubernetes/pod/modules.tf @@ -29,25 +29,3 @@ module "filter-tags-nocontainercreating" { filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" extra_tags_excluded = ["reason:containercreating"] } - -module "filter-tags-nocrashloopbackoff" { - source = "../../../common/filter-tags" - - environment = "${var.environment}" - resource = "kubernetes" - filter_tags_use_defaults = "${var.filter_tags_use_defaults}" - filter_tags_custom = "${var.filter_tags_custom}" - filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" - extra_tags_excluded = ["reason:crashloopbackoff"] -} - -module "filter-tags-crashloopbackoff" { - source = "../../../common/filter-tags" - - environment = "${var.environment}" - resource = "kubernetes" - filter_tags_use_defaults = "${var.filter_tags_use_defaults}" - filter_tags_custom = "${var.filter_tags_custom}" - filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" - extra_tags = ["reason:crashloopbackoff"] -} diff --git a/caas/kubernetes/pod/monitors-k8s-pod.tf b/caas/kubernetes/pod/monitors-k8s-pod.tf index 753f3dc..87a2599 100644 --- a/caas/kubernetes/pod/monitors-k8s-pod.tf +++ b/caas/kubernetes/pod/monitors-k8s-pod.tf @@ -30,7 +30,7 @@ resource "datadog_monitor" "pod_phase_status" { resource "datadog_monitor" "error" { count = "${var.error_enabled == "true" ? 1 : 0}" - name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Pod errors {{#is_alert}}{{{comparator}}} {{threshold}} times ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} times ({{value}}){{/is_warning}}" + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Pod waiting errors" type = "metric alert" message = "${coalesce(var.error_message, var.message)}" @@ -59,35 +59,3 @@ resource "datadog_monitor" "error" { silenced = "${var.error_silenced}" tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform", "${var.error_extra_tags}"] } - -resource "datadog_monitor" "crashloopbackoff" { - count = "${var.crashloopbackoff_enabled == "true" ? 1 : 0}" - name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Pod CrashLoopBackOff" - type = "metric alert" - message = "${coalesce(var.crashloopbackoff_message, var.message)}" - - query = < ${var.crashloopbackoff_threshold_critical} - EOQ - - thresholds { - critical = "${var.crashloopbackoff_threshold_critical}" - warning = "${var.crashloopbackoff_threshold_warning}" - } - - evaluation_delay = "${var.evaluation_delay}" - new_host_delay = "${var.new_host_delay}" - - notify_no_data = false - renotify_interval = 0 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - - silenced = "${var.crashloopbackoff_silenced}" - tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform", "${var.crashloopbackoff_extra_tags}"] -} diff --git a/caas/kubernetes/pod/outputs.tf b/caas/kubernetes/pod/outputs.tf index 9250444..2aceb97 100644 --- a/caas/kubernetes/pod/outputs.tf +++ b/caas/kubernetes/pod/outputs.tf @@ -7,8 +7,3 @@ output "error_id" { description = "id for monitor error" value = "${datadog_monitor.error.*.id}" } - -output "crashloopbackoff_id" { - description = "id for monitor crashloopbackoff" - value = "${datadog_monitor.crashloopbackoff.*.id}" -}