Merge branch 'MON-455_kubernetes_state.container.status_report.count.terminated' into 'master'
MON-455 add terminated container monitor Closes MON-455 See merge request claranet/pt-monitoring/projects/datadog/terraform/monitors!76
This commit is contained in:
commit
ee412ee29c
@ -17,6 +17,7 @@ module "datadog-monitors-caas-kubernetes-pod" {
|
|||||||
Creates DataDog monitors with the following checks:
|
Creates DataDog monitors with the following checks:
|
||||||
|
|
||||||
- Kubernetes Pod phase status failed
|
- Kubernetes Pod phase status failed
|
||||||
|
- Kubernetes Pod terminated abnormally
|
||||||
- Kubernetes Pod waiting errors
|
- Kubernetes Pod waiting errors
|
||||||
|
|
||||||
## Inputs
|
## Inputs
|
||||||
@ -43,6 +44,13 @@ Creates DataDog monitors with the following checks:
|
|||||||
| pod\_phase\_status\_time\_aggregator | Monitor aggregator for Pod phase status [available values: min, max or avg] | string | `"max"` | no |
|
| pod\_phase\_status\_time\_aggregator | Monitor aggregator for Pod phase status [available values: min, max or avg] | string | `"max"` | no |
|
||||||
| pod\_phase\_status\_timeframe | Monitor timeframe for Pod phase status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
|
| pod\_phase\_status\_timeframe | Monitor timeframe for Pod phase status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
|
||||||
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
|
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
|
||||||
|
| terminated\_enabled | Flag to enable Pod terminated monitor | string | `"true"` | no |
|
||||||
|
| terminated\_extra\_tags | Extra tags for Pod terminated monitor | list(string) | `[]` | no |
|
||||||
|
| terminated\_message | Custom message for Pod terminated monitor | string | `""` | no |
|
||||||
|
| terminated\_threshold\_critical | terminated critical threshold | string | `"0.5"` | no |
|
||||||
|
| terminated\_threshold\_warning | terminated warning threshold | string | `"0"` | no |
|
||||||
|
| terminated\_time\_aggregator | Monitor aggregator for Pod terminated [available values: min, max, sum or avg] | string | `"sum"` | no |
|
||||||
|
| terminated\_timeframe | Monitor timeframe for Pod terminated [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_10m"` | no |
|
||||||
|
|
||||||
## Outputs
|
## Outputs
|
||||||
|
|
||||||
@ -50,6 +58,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
|------|-------------|
|
|------|-------------|
|
||||||
| error\_id | id for monitor error |
|
| error\_id | id for monitor error |
|
||||||
| pod\_phase\_status\_id | id for monitor pod_phase_status |
|
| pod\_phase\_status\_id | id for monitor pod_phase_status |
|
||||||
|
| terminated\_id | id for monitor terminated |
|
||||||
|
|
||||||
## Related documentation
|
## Related documentation
|
||||||
|
|
||||||
|
|||||||
@ -110,3 +110,43 @@ variable "error_threshold_warning" {
|
|||||||
description = "error warning threshold"
|
description = "error warning threshold"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Switch for the "Pod terminated" monitor. The monitor resource compares this
# value against the literal string "true", so it is a string, not a bool.
variable "terminated_enabled" {
  type        = string
  default     = "true"
  description = "Flag to enable Pod terminated monitor"
}
|
||||||
|
|
||||||
|
# Additional tags appended (via concat) to the fixed tag list of the
# "Pod terminated" monitor. Empty by default, i.e. only the fixed tags apply.
variable "terminated_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Pod terminated monitor"
}
|
||||||
|
|
||||||
|
# Per-monitor notification message. The monitor passes this to coalesce()
# ahead of var.message, so the empty default falls through to var.message.
variable "terminated_message" {
  type        = string
  default     = ""
  description = "Custom message for Pod terminated monitor"
}
|
||||||
|
|
||||||
|
# Time-aggregation function placed at the start of the "Pod terminated"
# monitor query, i.e. "<aggregator>(<timeframe>):...".
# The description previously listed only min, max and avg even though the
# default is "sum"; the list now includes sum so the documented values cover
# the value actually used by default.
variable "terminated_time_aggregator" {
  description = "Monitor aggregator for Pod terminated [available values: min, max, sum or avg]"
  type        = string
  default     = "sum"
}
|
||||||
|
|
||||||
|
# Evaluation window placed inside the parentheses of the "Pod terminated"
# monitor query, right after the time aggregator.
variable "terminated_timeframe" {
  type        = string
  default     = "last_10m"
  description = "Monitor timeframe for Pod terminated [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
|
||||||
|
|
||||||
|
# Critical threshold of the "Pod terminated" monitor. The monitor uses it
# twice: as the right-hand side of the query comparison and as
# thresholds.critical. No type constraint is declared here.
variable "terminated_threshold_critical" {
  description = "terminated critical threshold"
  default     = 0.5
}
|
||||||
|
|
||||||
|
# Warning threshold of the "Pod terminated" monitor (thresholds.warning).
# No type constraint is declared here.
variable "terminated_threshold_warning" {
  description = "terminated warning threshold"
  default     = 0
}
|
||||||
|
|
||||||
|
|||||||
@ -57,3 +57,33 @@ EOQ
|
|||||||
tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform"], var.error_extra_tags)
|
tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform"], var.error_extra_tags)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Query-alert monitor on the terminated-container status report metric,
# evaluated per {namespace, pod, reason} group.
resource "datadog_monitor" "terminated" {
  # Created only when the enable flag is exactly the string "true".
  count = var.terminated_enabled == "true" ? 1 : 0

  type = "query alert"

  # Name is "[<prefix_slug>][<environment>] ..."; the prefix bracket is
  # omitted entirely when prefix_slug is empty.
  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Pod terminated abnormally"

  # The monitor-specific message takes precedence over var.message.
  message = coalesce(var.terminated_message, var.message)

  # Fixed tags first, then the caller-supplied extras.
  tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform"], var.terminated_extra_tags)

  # The value compared against in the query is the critical threshold, the
  # same variable used for thresholds.critical below.
  query = <<EOQ
${var.terminated_time_aggregator}(${var.terminated_timeframe}):
sum:kubernetes_state.container.status_report.count.terminated${module.filter-tags-nocontainercreating.query_alert} by {namespace,pod,reason}
> ${var.terminated_threshold_critical}
EOQ

  thresholds = {
    warning  = var.terminated_threshold_warning
    critical = var.terminated_threshold_critical
  }

  # Evaluation timing.
  require_full_window = true
  new_host_delay      = var.new_host_delay
  evaluation_delay    = var.evaluation_delay

  # Notification behaviour: no no-data alerts, no re-notification, no audit
  # notifications, timeout_h set to 0.
  notify_no_data    = false
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0

  include_tags = true
  locked       = false
}
|
||||||
|
|
||||||
|
|||||||
@ -8,3 +8,8 @@ output "error_id" {
|
|||||||
value = datadog_monitor.error.*.id
|
value = datadog_monitor.error.*.id
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# IDs of the "Pod terminated" monitor instances. The resource uses count, so
# the splat expression yields a list.
output "terminated_id" {
  value       = datadog_monitor.terminated.*.id
  description = "id for monitor terminated"
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user