diff --git a/system/generic/README.md b/system/generic/README.md index f5faf90..e56af90 100644 --- a/system/generic/README.md +++ b/system/generic/README.md @@ -17,7 +17,11 @@ Purpose ------- Creates a DataDog monitors with the following checks : -* System CPU High +* CPU usage +* CPU load ratio +* Free memory +* Free disk inodes +* Free disk space Inputs ------ @@ -29,6 +33,11 @@ Inputs | cpu_high_threshold_critical | CPU high critical threshold | string | `95` | no | | cpu_high_threshold_warning | CPU high warning threshold | string | `80` | no | | cpu_high_timeframe | CPU high timeframe | string | `last_5m` | no | +| cpu_load_message | Custom message for CPU load ratio monitor | string | `` | no | +| cpu_load_silenced | Groups to mute for CPU load ratio monitor | map | `` | no | +| cpu_load_threshold_critical | CPU load ratio critical threshold | string | `4` | no | +| cpu_load_threshold_warning | CPU load ratio warning threshold | string | `3` | no | +| cpu_load_timeframe | CPU load ratio timeframe | string | `last_5m` | no | | environment | Architecture Environment | string | - | yes | | delay | Delay in seconds for the metric evaluation | string | `15` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | diff --git a/system/generic/inputs.tf b/system/generic/inputs.tf index 6d3477a..8a77608 100644 --- a/system/generic/inputs.tf +++ b/system/generic/inputs.tf @@ -53,6 +53,33 @@ variable "cpu_high_threshold_critical" { default = 95 } +variable "cpu_load_silenced" { + description = "Groups to mute for CPU load ratio monitor" + type = "map" + default = {} +} + +variable "cpu_load_message" { + description = "Custom message for CPU load ratio monitor" + type = "string" + default = "" +} + +variable "cpu_load_timeframe" { + description = "CPU load ratio timeframe" + default = "last_5m" +} + +variable "cpu_load_threshold_warning" { + description = "CPU load ratio warning threshold" + default = 3 +} + 
+variable "cpu_load_threshold_critical" { + description = "CPU load ratio critical threshold" + default = 4 +} + variable "free_disk_space_silenced" { description = "Groups to mute for Free diskspace monitor" type = "map" diff --git a/system/generic/monitors-system.tf b/system/generic/monitors-system.tf index caaef3b..9fcf9cc 100644 --- a/system/generic/monitors-system.tf +++ b/system/generic/monitors-system.tf @@ -38,6 +38,39 @@ resource "datadog_monitor" "datadog_cpu_too_high" { silenced = "${var.cpu_high_silenced}" } +resource "datadog_monitor" "datadog_load_too_high" { + name = "[${var.environment}] CPU load 5 {{#is_alert}}{{comparator}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}} ({{value}}){{/is_warning}}" + message = "${coalesce(var.cpu_load_message, var.message)}" + + query = < ${var.cpu_load_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.cpu_load_threshold_warning}" + critical = "${var.cpu_load_threshold_critical}" + } + + tags = ["env:${var.environment}", "type:system", "resource:load"] + + notify_no_data = true + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + no_data_timeframe = 20 + + silenced = "${var.cpu_load_silenced}" +} + resource "datadog_monitor" "datadog_free_disk_space_too_low" { name = "[${var.environment}] Free disk space {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = "${coalesce(var.free_disk_space_message, var.message)}"