diff --git a/monitors-custom-cpu.tf b/monitors-custom-cpu.tf
new file mode 100644
index 0000000..e856edb
--- /dev/null
+++ b/monitors-custom-cpu.tf
@@ -0,0 +1,30 @@
+# CPU monitor driven by the dd_custom_cpu map (name, period, thresholds).
+# Created only when dd_custom_cpu["status"] is "enabled".
+resource "datadog_monitor" "cpu_custom" {
+  name = "${var.dd_custom_cpu["name"]}"
+  message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.warning_escalation_group}\n{{/is_warning_recovery}}"
+  count = "${var.dd_custom_cpu["status"] == "enabled" ? 1 : 0}"
+
+  # Sum of system and user CPU per host; both terms share one tag scope.
+  # NOTE(review): the scope excludes hosts tagged dd_custom_cpu:enabled, just
+  # like the default CPU monitors do - confirm it should not select them instead.
+  query = "min(${var.dd_custom_cpu["period"]}):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > ${var.dd_custom_cpu["critical_threshold"]}"
+  type = "query alert"
+
+  thresholds = {
+    warning = "${var.dd_custom_cpu["warning_threshold"]}"
+    critical = "${var.dd_custom_cpu["critical_threshold"]}"
+  }
+
+  notify_no_data = false
+  # Same value as the monitors in monitors-linux-basics.tf.
+  renotify_interval = 60
+  notify_audit = false
+  timeout_h = 0
+  include_tags = true
+  locked = false
+  require_full_window = true
+  new_host_delay = 300
+  no_data_timeframe = 20
+}
+
diff --git a/monitors-linux-basics.tf b/monitors-linux-basics.tf
index 87f5144..addee9d 100644
--- a/monitors-linux-basics.tf
+++ b/monitors-linux-basics.tf
@@ -1,7 +1,7 @@
 resource "datadog_monitor" "cpu_80_15min" {
   name = "CPU High > 80% for 15 min"
-  message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"
-  count = "${var.linux_basics == "enabled" ? 1 : 0}"
+  message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
+  count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
 
   query = "min(last_15m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > 80"
   type = "query alert"
@@ -21,11 +21,11 @@ resource "datadog_monitor" "cpu_80_15min" {
 
 resource "datadog_monitor" "cpu_95_5min" {
   name = "CPU High > 95% for 5 min"
-  message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"
+  message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
 
   query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > 95"
   type = "query alert"
-  count = "${var.linux_basics == "enabled" ? 1 : 0}"
+  count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
 
   notify_no_data = false
   renotify_interval = 60
@@ -42,11 +42,11 @@ resource "datadog_monitor" "cpu_95_5min" {
 
 resource "datadog_monitor" "datadog_free_disk_space_5" {
   name = "Free disk space < 5%"
-  message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"
+  message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"
 
   query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 5"
   type = "query alert"
-  count = "${var.linux_basics == "enabled" ? 1 : 0}"
+  count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
 
   notify_no_data = false
   renotify_interval = 60
@@ -63,11 +63,11 @@ resource "datadog_monitor" "datadog_free_disk_space_5" {
 
 resource "datadog_monitor" "datadog_free_disk_space_10" {
   name = "Free disk space < 10%"
-  message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}"
+  message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.warning_escalation_group}\n{{/is_warning_recovery}}"
 
   query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 10"
   type = "query alert"
-  count = "${var.linux_basics == "enabled" ? 1 : 0}"
+  count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
 
   thresholds {
     warning = 20
@@ -89,11 +89,11 @@ resource "datadog_monitor" "datadog_free_disk_space_10" {
 
 resource "datadog_monitor" "datadog_free_disk_space_inodes_5" {
   name = "Free disk inodes < 5%"
-  message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group} \n{{/is_recovery}}"
+  message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group} \n{{/is_recovery}}"
 
   query = "sum(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 5"
   type = "query alert"
-  count = "${var.linux_basics == "enabled" ? 1 : 0}"
+  count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
 
   notify_no_data = false
   renotify_interval = 60
@@ -110,11 +110,11 @@ resource "datadog_monitor" "datadog_free_disk_space_inodes_5" {
 
 resource "datadog_monitor" "datadog_free_disk_space_inodes_10" {
   name = "Free disk inodes < 10%"
-  message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}"
+  message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.warning_escalation_group}\n{{/is_warning_recovery}}"
 
   query = "max(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 10"
   type = "query alert"
-  count = "${var.linux_basics == "enabled" ? 1 : 0}"
+  count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
 
   thresholds {
     warning = 20
@@ -140,7 +140,7 @@ resource "datadog_monitor" "datadog_cpu_load" {
 
   query = "min(last_5m):avg:system.load.5{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {instance-id} / avg:gcp.gce.instance.cpu.reserved_cores{*} by {instance-id} > 2"
   type = "query alert"
-  count = "${var.linux_basics == "enabled" ? 1 : 0}"
+  count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
 
   notify_no_data = false
   renotify_interval = 60
@@ -161,7 +161,7 @@ resource "datadog_monitor" "datadog_free_memory" {
 
   query = "sum(last_1m):avg:system.mem.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_memory:enabled} by {host} / avg:system.mem.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_memory:enabled} by {host} * 100 < 5"
   type = "query alert"
-  count = "${var.linux_basics == "enabled" ? 1 : 0}"
+  count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
 
   notify_no_data = false
   renotify_interval = 60
@@ -178,11 +178,11 @@ resource "datadog_monitor" "datadog_free_memory" {
 
 resource "datadog_monitor" "datadog_host_unreachable" {
   name = "Host unreachable"
-  message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group} \n{{/is_recovery}}"
+  message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group} \n{{/is_recovery}}"
 
   query = "\"datadog.agent.up\".over(\"dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled\").last(1).count_by_status()"
   type = "service check"
-  count = "${var.linux_basics == "enabled" ? 1 : 0}"
+  count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
 
   notify_no_data = false
   renotify_interval = 60
diff --git a/testing/inputs.tf b/testing/inputs.tf
new file mode 100644
index 0000000..55bf02e
--- /dev/null
+++ b/testing/inputs.tf
@@ -0,0 +1,29 @@
+# Dummy notification handles for the testing configuration. critical_* takes
+# over from the former hno_escalation_group and warning_* from ho_escalation_group.
+variable "critical_escalation_group" {
+  default = "@pagerduty_HNODummy"
+}
+variable "warning_escalation_group" {
+  default = "@pagerduty_HODummy"
+}
+
+variable "datadog_app_key" {}
+variable "datadog_api_key" {}
+
+variable "dd_linux_basics" {
+  default = "enabled"
+}
+
+# Settings read by the cpu_custom monitor in monitors-custom-cpu.tf.
+variable "dd_custom_cpu" {
+  type = "map"
+  default = {
+    status = "enabled"
+    name = "CPU High > 95% during 1 hour"
+
+    period = "last_1h"
+
+    critical_threshold = 95
+    warning_threshold = 90
+  }
+}
diff --git a/testing/main.tf b/testing/main.tf
new file mode 100644
index 0000000..96b9614
--- /dev/null
+++ b/testing/main.tf
@@ -0,0 +1,4 @@
+provider "datadog" {
+  api_key = "${var.datadog_api_key}"
+  app_key = "${var.datadog_app_key}"
+}