add custom cpu stable
This commit is contained in:
parent
47bcedf331
commit
7aabbea18c
26
monitors-custom-cpu.tf
Normal file
26
monitors-custom-cpu.tf
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
# Per-host CPU monitor (system + user CPU) whose name, evaluation window and
# thresholds are all taken from the var.dd_custom_cpu map. The monitor is only
# created when var.dd_custom_cpu["status"] is "enabled".
resource "datadog_monitor" "cpu_custom" {
  name    = "${var.dd_custom_cpu["name"]}"
  message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.warning_escalation_group}\n{{/is_warning_recovery}}"
  count   = "${var.dd_custom_cpu["status"] == "enabled" ? 1 : 0}"

  # Both halves of the sum must be scoped to the same host set. The original
  # excluded "dd_custom_cpu.monitoring:enabled" in the first half and
  # "dd_custom_cpu:enabled" in the second; the tag is now "dd_custom_cpu" in
  # both, matching every other query in this module.
  # NOTE(review): the scope still *excludes* hosts tagged dd_custom_cpu:enabled,
  # exactly like the default CPU monitors do - confirm that a negated filter is
  # really what is wanted for the custom monitor.
  query = "min(${var.dd_custom_cpu["period"]}):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > ${var.dd_custom_cpu["critical_threshold"]}"
  type  = "query alert"

  # The critical value here is the same map entry used in the query comparison.
  thresholds = {
    warning  = "${var.dd_custom_cpu["warning_threshold"]}"
    critical = "${var.dd_custom_cpu["critical_threshold"]}"
  }

  # Each option is declared once. The original set notify_no_data twice and
  # renotify_interval twice with conflicting values (60, then 0).
  # NOTE(review): 60 is kept because the other monitors in this module use 60;
  # switch to 0 if re-notification was meant to be disabled.
  notify_no_data      = false
  no_data_timeframe   = 20
  renotify_interval   = 60
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}
|
||||||
|
|
||||||
@ -1,7 +1,7 @@
|
|||||||
resource "datadog_monitor" "cpu_80_15min" {
|
resource "datadog_monitor" "cpu_80_15min" {
|
||||||
name = "CPU High > 80% for 15 min"
|
name = "CPU High > 80% for 15 min"
|
||||||
message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"
|
message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
|
||||||
count = "${var.linux_basics == "enabled" ? 1 : 0}"
|
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
|
||||||
|
|
||||||
query = "min(last_15m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > 80"
|
query = "min(last_15m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > 80"
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
@ -21,11 +21,11 @@ resource "datadog_monitor" "cpu_80_15min" {
|
|||||||
|
|
||||||
resource "datadog_monitor" "cpu_95_5min" {
|
resource "datadog_monitor" "cpu_95_5min" {
|
||||||
name = "CPU High > 95% for 5 min"
|
name = "CPU High > 95% for 5 min"
|
||||||
message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"
|
message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
|
||||||
|
|
||||||
query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > 95"
|
query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > 95"
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
count = "${var.linux_basics == "enabled" ? 1 : 0}"
|
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
|
||||||
|
|
||||||
notify_no_data = false
|
notify_no_data = false
|
||||||
renotify_interval = 60
|
renotify_interval = 60
|
||||||
@ -42,11 +42,11 @@ resource "datadog_monitor" "cpu_95_5min" {
|
|||||||
|
|
||||||
resource "datadog_monitor" "datadog_free_disk_space_5" {
|
resource "datadog_monitor" "datadog_free_disk_space_5" {
|
||||||
name = "Free disk space < 5%"
|
name = "Free disk space < 5%"
|
||||||
message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"
|
message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
|
||||||
|
|
||||||
query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 5"
|
query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 5"
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
count = "${var.linux_basics == "enabled" ? 1 : 0}"
|
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
|
||||||
|
|
||||||
notify_no_data = false
|
notify_no_data = false
|
||||||
renotify_interval = 60
|
renotify_interval = 60
|
||||||
@ -63,11 +63,11 @@ resource "datadog_monitor" "datadog_free_disk_space_5" {
|
|||||||
|
|
||||||
resource "datadog_monitor" "datadog_free_disk_space_10" {
|
resource "datadog_monitor" "datadog_free_disk_space_10" {
|
||||||
name = "Free disk space < 10%"
|
name = "Free disk space < 10%"
|
||||||
message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}"
|
message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.warning_escalation_group}\n{{/is_warning_recovery}}"
|
||||||
|
|
||||||
query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 10"
|
query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 10"
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
count = "${var.linux_basics == "enabled" ? 1 : 0}"
|
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = 20
|
warning = 20
|
||||||
@ -89,11 +89,11 @@ resource "datadog_monitor" "datadog_free_disk_space_10" {
|
|||||||
|
|
||||||
resource "datadog_monitor" "datadog_free_disk_space_inodes_5" {
|
resource "datadog_monitor" "datadog_free_disk_space_inodes_5" {
|
||||||
name = "Free disk inodes < 5%"
|
name = "Free disk inodes < 5%"
|
||||||
message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group} \n{{/is_recovery}}"
|
message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group} \n{{/is_recovery}}"
|
||||||
|
|
||||||
query = "sum(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 5"
|
query = "sum(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 5"
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
count = "${var.linux_basics == "enabled" ? 1 : 0}"
|
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
|
||||||
|
|
||||||
notify_no_data = false
|
notify_no_data = false
|
||||||
renotify_interval = 60
|
renotify_interval = 60
|
||||||
@ -110,11 +110,11 @@ resource "datadog_monitor" "datadog_free_disk_space_inodes_5" {
|
|||||||
|
|
||||||
resource "datadog_monitor" "datadog_free_disk_space_inodes_10" {
|
resource "datadog_monitor" "datadog_free_disk_space_inodes_10" {
|
||||||
name = "Free disk inodes < 10%"
|
name = "Free disk inodes < 10%"
|
||||||
message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}"
|
message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.warning_escalation_group}\n{{/is_warning_recovery}}"
|
||||||
|
|
||||||
query = "max(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 10"
|
query = "max(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 10"
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
count = "${var.linux_basics == "enabled" ? 1 : 0}"
|
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = 20
|
warning = 20
|
||||||
@ -140,7 +140,7 @@ resource "datadog_monitor" "datadog_cpu_load" {
|
|||||||
|
|
||||||
query = "min(last_5m):avg:system.load.5{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {instance-id} / avg:gcp.gce.instance.cpu.reserved_cores{*} by {instance-id} > 2"
|
query = "min(last_5m):avg:system.load.5{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {instance-id} / avg:gcp.gce.instance.cpu.reserved_cores{*} by {instance-id} > 2"
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
count = "${var.linux_basics == "enabled" ? 1 : 0}"
|
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
|
||||||
|
|
||||||
notify_no_data = false
|
notify_no_data = false
|
||||||
renotify_interval = 60
|
renotify_interval = 60
|
||||||
@ -161,7 +161,7 @@ resource "datadog_monitor" "datadog_free_memory" {
|
|||||||
|
|
||||||
query = "sum(last_1m):avg:system.mem.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_memory:enabled} by {host} / avg:system.mem.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_memory:enabled} by {host} * 100 < 5"
|
query = "sum(last_1m):avg:system.mem.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_memory:enabled} by {host} / avg:system.mem.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_memory:enabled} by {host} * 100 < 5"
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
count = "${var.linux_basics == "enabled" ? 1 : 0}"
|
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
|
||||||
|
|
||||||
notify_no_data = false
|
notify_no_data = false
|
||||||
renotify_interval = 60
|
renotify_interval = 60
|
||||||
@ -178,11 +178,11 @@ resource "datadog_monitor" "datadog_free_memory" {
|
|||||||
|
|
||||||
resource "datadog_monitor" "datadog_host_unreachable" {
|
resource "datadog_monitor" "datadog_host_unreachable" {
|
||||||
name = "Host unreachable"
|
name = "Host unreachable"
|
||||||
message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group} \n{{/is_recovery}}"
|
message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group} \n{{/is_recovery}}"
|
||||||
|
|
||||||
query = "\"datadog.agent.up\".over(\"dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled\").last(1).count_by_status()"
|
query = "\"datadog.agent.up\".over(\"dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled\").last(1).count_by_status()"
|
||||||
type = "service check"
|
type = "service check"
|
||||||
count = "${var.linux_basics == "enabled" ? 1 : 0}"
|
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
|
||||||
|
|
||||||
notify_no_data = false
|
notify_no_data = false
|
||||||
renotify_interval = 60
|
renotify_interval = 60
|
||||||
|
|||||||
26
testing/inputs.tf
Normal file
26
testing/inputs.tf
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
# Notification handle interpolated into the alert / recovery sections of the
# monitor messages.
# NOTE(review): the monitors previously used var.hno_escalation_group for
# alerts, yet this test default is the "HO" dummy handle - confirm the HO/HNO
# defaults of the two escalation variables are not swapped.
variable "critical_escalation_group" {
  default = "@pagerduty_HODummy"
}
|
||||||
|
# Notification handle interpolated into the warning / warning-recovery sections
# of the monitor messages.
# NOTE(review): the monitors previously used var.ho_escalation_group for
# warnings, yet this test default is the "HNO" dummy handle - confirm the
# HO/HNO defaults of the two escalation variables are not swapped.
variable "warning_escalation_group" {
  default = "@pagerduty_HNODummy"
}
|
||||||
|
|
||||||
|
# Datadog application key; no default, so it must be supplied by the caller.
# Passed to the datadog provider as app_key.
variable "datadog_app_key" {}
|
||||||
|
# Datadog API key; no default, so it must be supplied by the caller.
# Passed to the datadog provider as api_key.
variable "datadog_api_key" {}
|
||||||
|
|
||||||
|
# Switch for the basic Linux monitors: each of them is created only when this
# value equals "enabled" (see their count expressions).
variable "dd_linux_basics" {
  default = "enabled"
}
|
||||||
|
|
||||||
|
# Settings consumed by the datadog_monitor.cpu_custom resource.
variable "dd_custom_cpu" {
  type = "map"

  default = {
    # The monitor is created only when status is "enabled".
    status = "enabled"

    # Display name of the monitor.
    name = "CPU High > 95% during 1 hour"

    # Evaluation window placed inside min(...) in the monitor query.
    period = "last_1h"

    # Compared against system + user CPU in the query and used as thresholds.
    critical_threshold = 95
    warning_threshold  = 90
  }
}
|
||||||
4
testing/main.tf
Normal file
4
testing/main.tf
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
# Datadog provider for the test configuration; both credentials come from
# input variables rather than being hard-coded here.
provider "datadog" {
  api_key = "${var.datadog_api_key}"
  app_key = "${var.datadog_app_key}"
}
|
||||||
Loading…
x
Reference in New Issue
Block a user