133 lines
4.9 KiB
HCL
133 lines
4.9 KiB
HCL
# Cluster CPU monitor: evaluates the 5-minute average of system + user CPU,
# grouped by cluster-name, and alerts at warning = 75 / critical = 85.
resource "datadog_monitor" "kubernetes_cluster_cpu" {
  name = "Kubernetes cluster CPU High > 85%"
  type = "query alert"

  # Alert and recovery notifications use var.alert_HNO; warning and
  # warning-recovery notifications use var.warning_HO.
  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"

  # The comparison value at the end of the query (85) matches thresholds.critical.
  query = "avg(last_5m):avg:system.cpu.system{*} by {cluster-name} + avg:system.cpu.user{*} by {cluster-name} > 85"

  thresholds {
    warning  = 75
    critical = 85
  }

  notify_no_data    = false
  no_data_timeframe = 20

  # NOTE(review): this block previously assigned renotify_interval twice
  # (60, then 0) and notify_no_data twice (false both times). Each argument
  # may appear only once per block; the later value (0) is retained here.
  # Confirm that 0 rather than 60 is the intended re-notification interval.
  renotify_interval = 0

  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}
|
|
|
|
# Service-check monitor on "kubernetes.kubelet.check", scoped to
# "goog-gke-node", with thresholds warning = 0 / critical = 10.
resource "datadog_monitor" "kubernetes_kubelet_check" {
  name = "Kubernetes kubelet check down"
  type = "service check"

  # Alert and recovery notifications use var.alert_HNO; warning and
  # warning-recovery notifications use var.warning_HO.
  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"

  query = "\"kubernetes.kubelet.check\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()"

  thresholds {
    warning  = 0
    critical = 10
  }

  notify_no_data    = false
  no_data_timeframe = 20

  # NOTE(review): this block previously assigned renotify_interval twice
  # (60, then 0) and notify_no_data twice (false both times). Each argument
  # may appear only once per block; the later value (0) is retained here.
  # Confirm that 0 rather than 60 is the intended re-notification interval.
  renotify_interval = 0

  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}
|
|
|
|
# Service-check monitor on "kubernetes.kubelet.check.ping", scoped to
# "goog-gke-node", with thresholds warning = 0 / critical = 10.
resource "datadog_monitor" "kubernetes_kubelet_ping" {
  name = "Kubernetes kubelet ping not ok"
  type = "service check"

  # Alert and recovery notifications use var.alert_HNO; warning and
  # warning-recovery notifications use var.warning_HO.
  # (The message argument was previously declared twice with identical text;
  # a single declaration is kept.)
  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"

  query = "\"kubernetes.kubelet.check.ping\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()"

  thresholds {
    warning  = 0
    critical = 10
  }

  notify_no_data    = false
  no_data_timeframe = 20

  # NOTE(review): this block previously assigned renotify_interval twice
  # (60, then 0) and notify_no_data twice (false both times). Each argument
  # may appear only once per block; the later value (0) is retained here.
  # Confirm that 0 rather than 60 is the intended re-notification interval.
  renotify_interval = 0

  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}
|
|
|
|
# Deployment availability monitor: per cluster-name/namespace/deployment
# (excluding namespace "cronetes"), evaluates
# replicas_desired - replicas_unavailable + 1 over the last 5 minutes.
resource "datadog_monitor" "kubernetes_pods_unavailable" {
  name = "Kubernetes pods unavailable"
  type = "query alert"

  # Alert and recovery notifications use var.alert_HNO; warning and
  # warning-recovery notifications use var.warning_HO.
  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"

  # NOTE(review): the comparison is strict ("+ 1 < 1"), so the condition only
  # holds when desired - unavailable is below 0. If the intent is to alert
  # when no replicas are available, "<= 1" or "< 1" without the "+ 1" offset
  # may have been meant; confirm before changing the query.
  query = "max(last_5m):avg:kubernetes_state.deployment.replicas_desired{!namespace:cronetes} by {cluster-name,namespace,deployment} - avg:kubernetes_state.deployment.replicas_unavailable{!namespace:cronetes} by {cluster-name,namespace,deployment} + 1 < 1"

  notify_no_data    = false
  no_data_timeframe = 20

  # NOTE(review): this block previously assigned renotify_interval twice
  # (60, then 0) and notify_no_data twice (false both times). Each argument
  # may appear only once per block; the later value (0) is retained here.
  # Confirm that 0 rather than 60 is the intended re-notification interval.
  renotify_interval = 0

  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}
|
|
|
|
# Node status monitor: alerts when the 5-minute maximum of
# kubernetes_state.node.status is <= 0 for a group.
resource "datadog_monitor" "kubernetes_node_status" {
  name = "Kubernetes node status"
  type = "metric alert"

  # Alert and recovery notifications use var.alert_HNO; warning and
  # warning-recovery notifications use var.warning_HO.
  message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"

  # NOTE(review): the scope (!namespace:cronetes) and grouping
  # (cluster-name,namespace,deployment) are identical to the pods-unavailable
  # monitor's; verify that a node-level metric actually carries the namespace
  # and deployment tags, otherwise this grouping may not behave as expected.
  query = "max(last_5m):avg:kubernetes_state.node.status{!namespace:cronetes} by {cluster-name,namespace,deployment} <= 0"

  notify_no_data    = false
  no_data_timeframe = 20

  # NOTE(review): this block previously assigned renotify_interval twice
  # (60, then 0) and notify_no_data twice (false both times). Each argument
  # may appear only once per block; the later value (0) is retained here.
  # Confirm that 0 rather than 60 is the intended re-notification interval.
  renotify_interval = 0

  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}
|
|
|
|
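/*
  NOTE(review): orphaned, commented-out tail of a monitor definition; it has
  no resource header, name, message, or query. Kept for reference only.
  It also repeats notify_no_data and renotify_interval, so the duplicates
  must be resolved before this is ever re-enabled.

  type = "query alert"

  thresholds {
    # warning = 75
    critical = 80
  }

  notify_no_data      = false
  renotify_interval   = 60
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
  notify_no_data      = false
  renotify_interval   = 0
  no_data_timeframe   = 20
}
*/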
|
|
|