diff --git a/monitors-linux-basics.tf b/monitors-linux-basics.tf index 223c6ef..1ed76a8 100644 --- a/monitors-linux-basics.tf +++ b/monitors-linux-basics.tf @@ -3,7 +3,7 @@ resource "datadog_monitor" "cpu_80_15min" { message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" count = "${var.dd_linux_basics == "enabled" ? 1 : 0}" - query = "min(last_15m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region, stack} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region, stack} > ${var.cpu_15_critical}" + query = "min(last_15m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,stack} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,stack} > ${var.cpu_15_critical}" type = "query alert" thresholds { @@ -26,7 +26,7 @@ resource "datadog_monitor" "cpu_95_5min" { name = "[${var.env}] CPU High > ${var.cpu_5_critical} for 5 min on {{host.name}}" message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" - query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,stack} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region, stack} > ${var.cpu_5_critical}" + query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,stack} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,stack} > ${var.cpu_5_critical}" type = "query alert" count = "${var.dd_linux_basics == "enabled" ? 1 : 0}" @@ -50,7 +50,7 @@ resource "datadog_monitor" "datadog_free_disk_space_5" { name = "[${var.env}] Free disk space < 5% on {{host.name}}" message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" - query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region, device,stack} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region, device,stack} * 100 < 5" + query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device,stack} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device,stack} * 100 < 5" type = "query alert" count = "${var.dd_linux_basics == "enabled" ? 1 : 0}" @@ -74,7 +74,7 @@ resource "datadog_monitor" "datadog_free_disk_space_10" { name = "[${var.env}] Free disk space < 10% on {{host.name}}" message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" - query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device,stack} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region, device,stack} * 100 < 10" + query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device,stack} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device,stack} * 100 < 10" type = "query alert" count = "${var.dd_linux_basics == "enabled" ? 1 : 0}" @@ -99,7 +99,7 @@ resource "datadog_monitor" "datadog_free_disk_space_inodes_5" { name = "[${var.env}] Free disk inodes < 5% on {{host.name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group} \n{{/is_recovery}}" - query = "sum(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region, device,stack} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region, device,stack} * 100 < 5" + query = "sum(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device,stack} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device,stack} * 100 < 5" type = "query alert" count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"