From bb9839f390d8919f2fef9aa2244ad735fcb21365 Mon Sep 17 00:00:00 2001
From: Quentin Manfroi
Date: Tue, 21 Aug 2018 13:50:01 +0200
Subject: [PATCH 1/3] MON-286 add host unreachable monitor to system generic

---
 system/generic/README.md          |  5 +++++
 system/generic/inputs.tf          | 20 +++++++++++++++++++-
 system/generic/monitors-system.tf | 27 +++++++++++++++++++++++++++
 system/generic/outputs.tf         |  5 +++++
 4 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/system/generic/README.md b/system/generic/README.md
index b3b450c..ea6c8d4 100644
--- a/system/generic/README.md
+++ b/system/generic/README.md
@@ -16,6 +16,7 @@ module "datadog-monitors-system-generic" {
 
 Creates DataDog monitors with the following checks:
 
+- Host unreachable
 - CPU usage
 - CPU load 5
 - Free disk space
@@ -81,6 +82,9 @@ Creates DataDog monitors with the following checks:
 | free_memory_timeframe | Monitor timeframe for Free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
 | message | Message sent when an alert is triggered | string | - | yes |
 | new_host_delay | Delay in seconds before monitor new resource | string | `300` | no |
+| unreachable_extra_tags | Extra tags for Host unreachable monitor | list | `` | no |
+| unreachable_message | Custom message for Host unreachable monitor | string | `` | no |
+| unreachable_silenced | Groups to mute for Host unreachable monitor | map | `` | no |
 
 ## Outputs
 
@@ -91,6 +95,7 @@ Creates DataDog monitors with the following checks:
 | datadog_free_disk_space_inodes_too_low_id | id for monitor datadog_free_disk_space_inodes_too_low |
 | datadog_free_disk_space_too_low_id | id for monitor datadog_free_disk_space_too_low |
 | datadog_free_memory_id | id for monitor datadog_free_memory |
+| datadog_host_unreachable_id | id for monitor datadog_host_unreachable |
 | datadog_load_too_high_id | id for monitor datadog_load_too_high |
 
 ## Related documentation
diff --git a/system/generic/inputs.tf b/system/generic/inputs.tf
index 87b1bd8..249a6b7 100644
--- a/system/generic/inputs.tf
+++ b/system/generic/inputs.tf
@@ -29,7 +29,25 @@ variable "filter_tags_custom" {
   default     = "*"
 }
 
-# Custom CPU instance specific
+# System generic specific
+
+variable "unreachable_silenced" {
+  description = "Groups to mute for Host unreachable monitor"
+  type        = "map"
+  default     = {}
+}
+
+variable "unreachable_extra_tags" {
+  description = "Extra tags for Host unreachable monitor"
+  type        = "list"
+  default     = []
+}
+
+variable "unreachable_message" {
+  description = "Custom message for Host unreachable monitor"
+  type        = "string"
+  default     = ""
+}
 
 variable "cpu_high_silenced" {
   description = "Groups to mute for CPU high monitor"
diff --git a/system/generic/monitors-system.tf b/system/generic/monitors-system.tf
index 89db0c4..f113d8d 100644
--- a/system/generic/monitors-system.tf
+++ b/system/generic/monitors-system.tf
@@ -1,3 +1,30 @@
+resource "datadog_monitor" "datadog_host_unreachable" {
+  name    = "[${var.environment}] Host unreachable"
+  message = "${coalesce(var.unreachable_message, var.message)}"
+
+  query = "\"datadog.agent.up\".over${module.filter-tags.service_check}.last(6).count_by_status()"
+
+  type = "service check"
+
+  thresholds {
+    ok       = 1
+    warning  = 1
+    critical = 5
+  }
+
+  notify_no_data      = true
+  new_host_delay      = "${var.new_host_delay}"
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+
+  silenced = "${var.unreachable_silenced}"
+
+  tags = ["env:${var.environment}", "type:system", "provider:host", "resource:generic", "team:claranet", "created-by:terraform", "${var.unreachable_extra_tags}"]
+}
+
 resource "datadog_monitor" "datadog_cpu_too_high" {
   name    = "[${var.environment}] CPU usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
   message = "${coalesce(var.cpu_high_message, var.message)}"
diff --git a/system/generic/outputs.tf b/system/generic/outputs.tf
index 85492d8..e870cda 100644
--- a/system/generic/outputs.tf
+++ b/system/generic/outputs.tf
@@ -1,3 +1,8 @@
+output "datadog_host_unreachable_id" {
+  description = "id for monitor datadog_host_unreachable"
+  value       = "${datadog_monitor.datadog_host_unreachable.id}"
+}
+
 output "datadog_cpu_too_high_id" {
   description = "id for monitor datadog_cpu_too_high"
   value       = "${datadog_monitor.datadog_cpu_too_high.id}"

From cf48c3aa98ffa13e8df0e3cd8e6db849695fc9aa Mon Sep 17 00:00:00 2001
From: Quentin Manfroi
Date: Tue, 21 Aug 2018 15:26:05 +0200
Subject: [PATCH 2/3] MON-281 use system instead of generic which is more eloquent

---
 system/generic/modules.tf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/system/generic/modules.tf b/system/generic/modules.tf
index a62a76a..0bc98e5 100644
--- a/system/generic/modules.tf
+++ b/system/generic/modules.tf
@@ -2,7 +2,7 @@ module "filter-tags" {
   source = "../../common/filter-tags"
 
   environment              = "${var.environment}"
-  resource                 = "generic"
+  resource                 = "system"
   filter_tags_use_defaults = "${var.filter_tags_use_defaults}"
   filter_tags_custom       = "${var.filter_tags_custom}"
 }
@@ -11,7 +11,7 @@ module "filter-tags-disk" {
   source = "../../common/filter-tags"
 
   environment              = "${var.environment}"
-  resource                 = "generic"
+  resource                 = "system"
   filter_tags_use_defaults = "${var.filter_tags_use_defaults}"
   filter_tags_custom       = "${var.filter_tags_custom}"
   extra_tags               = ["dd_disk:enabled"]

From 8b9eaaae2d8a4a332d4d6097559f5982eb0d2121 Mon Sep 17 00:00:00 2001
From: Quentin Manfroi
Date: Tue, 21 Aug 2018 15:52:55 +0200
Subject: [PATCH 3/3] MON-281 add variable for no_data_timeframe

---
 Review note: unreachable_no_data_timeframe is expressed in minutes (Datadog
 monitor option no_data_timeframe); the description now states the unit and the
 default is quoted to match the declared string type.

 system/generic/README.md          | 1 +
 system/generic/inputs.tf          | 6 ++++++
 system/generic/monitors-system.tf | 1 +
 3 files changed, 8 insertions(+)

diff --git a/system/generic/README.md b/system/generic/README.md
index ea6c8d4..b812152 100644
--- a/system/generic/README.md
+++ b/system/generic/README.md
@@ -84,6 +84,7 @@ Creates DataDog monitors with the following checks:
 | new_host_delay | Delay in seconds before monitor new resource | string | `300` | no |
 | unreachable_extra_tags | Extra tags for Host unreachable monitor | list | `` | no |
 | unreachable_message | Custom message for Host unreachable monitor | string | `` | no |
+| unreachable_no_data_timeframe | Timeframe in minutes for Host unreachable monitor to alert on no data | string | `2` | no |
 | unreachable_silenced | Groups to mute for Host unreachable monitor | map | `` | no |
 
 ## Outputs
diff --git a/system/generic/inputs.tf b/system/generic/inputs.tf
index 249a6b7..c67016d 100644
--- a/system/generic/inputs.tf
+++ b/system/generic/inputs.tf
@@ -49,6 +49,12 @@ variable "unreachable_message" {
   default     = ""
 }
 
+variable "unreachable_no_data_timeframe" {
+  description = "Timeframe in minutes for Host unreachable monitor to alert on no data"
+  type        = "string"
+  default     = "2"
+}
+
 variable "cpu_high_silenced" {
   description = "Groups to mute for CPU high monitor"
   type        = "map"
diff --git a/system/generic/monitors-system.tf b/system/generic/monitors-system.tf
index f113d8d..ecbe419 100644
--- a/system/generic/monitors-system.tf
+++ b/system/generic/monitors-system.tf
@@ -19,6 +19,7 @@ resource "datadog_monitor" "datadog_host_unreachable" {
   include_tags        = true
   locked              = false
   require_full_window = true
+  no_data_timeframe   = "${var.unreachable_no_data_timeframe}"
 
   silenced = "${var.unreachable_silenced}"
 