From 11edd921deae40a5701c3eb40ee7fed902a322f0 Mon Sep 17 00:00:00 2001
From: vincent EL KHATIB
Date: Tue, 12 Sep 2017 15:03:00 +0200
Subject: [PATCH] rebase from master

Scope the stock Datadog monitors per environment and make their timing
configurable:

- add the `env` and `region` module inputs; prefix monitor names with
  [env] and filter the Linux and RDS metric selectors on the env tag
- group the Linux monitors by host/region/stack and the RDS monitors by
  identifier/region
- replace the hard-coded notify_no_data / new_host_delay values with the
  linux_basics_config and rds_config maps and set evaluation_delay
- drop the duplicated notify_no_data / renotify_interval attributes
- rename dd_rds_mysql_basics to dd_aws_rds and add the rds_cpu_threshold
  and rds_mem_threshold maps
- remove the dd_custom_memory and dd_custom_rds-mysql inputs
- comment out the "Host unreachable" monitor
---
Review fixes folded into this patch (added lines only, hunk sizes unchanged):
  * cpu_95_5min / free_disk_space_5: second selector now filters on env too
  * free_disk_space_inodes_10: both operands grouped by {host,device,region,stack}
  * mysql_rds_free_space_low: total_storage_space selector uses dd_aws_rds
  * RDS names use {{identifier.name}} (queries group by identifier, not host)
  * RDS queries compare against the same variable as thresholds.critical
The blob ids on the "index" lines were not recomputed after these fixes.

 datadog-samples/inputs-declaration.sample  |  11 +-
 datadog-samples/modules-declaration.sample |  12 +-
 inputs.tf                                  |  44 +++++--
 monitors-custom-cpu.tf                     |   7 +-
 monitors-linux-basics.tf                   | 137 ++++++++++-----------
 monitors-rds_mysql-basics.tf               |  37 +++---
 testing/inputs.tf                          |  20 ++-
 testing/modules.tf                         |  12 +-
 8 files changed, 152 insertions(+), 128 deletions(-)

diff --git a/datadog-samples/inputs-declaration.sample b/datadog-samples/inputs-declaration.sample
index 55bf02e..a82030e 100644
--- a/datadog-samples/inputs-declaration.sample
+++ b/datadog-samples/inputs-declaration.sample
@@ -1,3 +1,7 @@
+variable environment {}
+
+variable region {}
+
 variable "critical_escalation_group" {
   default = "@pagerduty_HODummy"
 }
@@ -5,13 +9,18 @@ variable "warning_escalation_group" {
   default = "@pagerduty_HNODummy"
 }

-variable "datadog_app_key" {}
 variable "datadog_api_key" {}
+variable "datadog_app_key" {}

 variable "dd_linux_basics" {
   default = "enabled"
 }

+variable "dd_aws_rds" {
+  default = "enabled"
+}
+
+
 variable "dd_custom_cpu" {
   type = "map"
   default = {
diff --git a/datadog-samples/modules-declaration.sample b/datadog-samples/modules-declaration.sample
index 036cb16..5574a18 100644
--- a/datadog-samples/modules-declaration.sample
+++ b/datadog-samples/modules-declaration.sample
@@ -1,13 +1,15 @@
 module "datadog-monitors" {
   source = "git::ssh://git@bitbucket.org/morea/terraform.datadog.monitors.git"

+  env = "${var.environment}"
+  region = "${var.region}"
+
   critical_escalation_group = "${var.critical_escalation_group}"
   warning_escalation_group = "${var.warning_escalation_group}"

-  #default monitors templates integrations examples
-  dd_linux_basics = "${var.dd_linux_basics}"
-  #nginx = "false"
-  #aws_rds_mysql = "false"
-  dd_custom_cpu = "${var.dd_custom_cpu}"
+  dd_aws_rds = "${var.dd_aws_rds}"
+  dd_linux_basics = "${var.dd_linux_basics}"
+

 }
+
diff --git a/inputs.tf b/inputs.tf
index 00ca421..d07327d 100644
--- a/inputs.tf
+++ b/inputs.tf
@@ -1,31 +1,55 @@
 variable "critical_escalation_group" {}
 variable "warning_escalation_group" {}

+variable env {}
+variable region {}
+
+
+##linux
 variable "dd_linux_basics" {
   default = "disabled"
 }
-
+variable "linux_basics_config" {
+  type = "map"
+  default = {
+    notify_no_data = false
+    delay = 900
+  }
+}
 variable "dd_custom_cpu" {
   type = "map"
   default = {
     status = "disabled"
   }
 }
-variable "dd_custom_memory" {
-  type = "map"
-  default = {
-    status = "disabled"
-  }
-}

-variable "dd_rds_mysql_basics" {
+## RDS
+variable "dd_aws_rds" {
   default = "disabled"
 }

-variable "dd_custom_rds-mysql" {
+variable "rds_config" {
   type = "map"
   default = {
-    status = "disabled"
+    notify_no_data = false
+    delay = 900
+  }
+}
+variable "rds_cpu_threshold" {
+  type = "map"
+  default = {
+    warning = 80
+    critical = 90
+  }
+}
+variable "rds_mem_threshold" {
+  default = {
+    warning = 20
+    critical = 10
   }
 }

+
+
+
+
diff --git a/monitors-custom-cpu.tf b/monitors-custom-cpu.tf
index e856edb..a29d3af 100644
--- a/monitors-custom-cpu.tf
+++ b/monitors-custom-cpu.tf
@@ -11,16 +11,15 @@ resource "datadog_monitor" "cpu_custom" {
     critical = "${var.dd_custom_cpu["critical_threshold"]}"
   }

-  notify_no_data = false
+  notify_no_data = "${var.linux_basics_config["notify_no_data"]}"
+  evaluation_delay = "${var.linux_basics_config["delay"]}"
+  new_host_delay = "${var.linux_basics_config["delay"]}"
   renotify_interval = 60
   notify_audit = false
   timeout_h = 0
   include_tags = true
   locked = false
   require_full_window = true
-  new_host_delay = 300
-  notify_no_data = false
-  renotify_interval = 0
   no_data_timeframe = 20
 }

diff --git a/monitors-linux-basics.tf b/monitors-linux-basics.tf
index c58ddc7..07a5a33 100644
--- a/monitors-linux-basics.tf
+++ b/monitors-linux-basics.tf
@@ -1,71 +1,68 @@
 resource "datadog_monitor" "cpu_80_15min" {
-  name = "CPU High > 80% for 15 min"
+  name = "[${var.env}] CPU High > 80% for 15 min on {{host.name}}"
   message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"

   count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
-  query = "min(last_15m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > 80"
+  query = "min(last_15m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,stack} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,stack} > 80"
   type = "query alert"

-  notify_no_data = false
+  notify_no_data = "${var.linux_basics_config["notify_no_data"]}"
+  evaluation_delay = "${var.linux_basics_config["delay"]}"
+  new_host_delay = "${var.linux_basics_config["delay"]}"
   renotify_interval = 60
   notify_audit = false
   timeout_h = 0
   include_tags = true
   locked = false
   require_full_window = true
-  new_host_delay = 300
-  notify_no_data = false
-  renotify_interval = 0
   no_data_timeframe = 20
 }

 resource "datadog_monitor" "cpu_95_5min" {
-  name = "CPU High > 95% for 5 min"
+  name = "[${var.env}] CPU High > 95% for 5 min on {{host.name}}"
   message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"

-  query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabledd} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > 95"
+  query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,stack} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,stack} > 95"
   type = "query alert"
   count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"

-  notify_no_data = false
+  notify_no_data = "${var.linux_basics_config["notify_no_data"]}"
+  evaluation_delay = "${var.linux_basics_config["delay"]}"
+  new_host_delay = "${var.linux_basics_config["delay"]}"
   renotify_interval = 60
   notify_audit = false
   timeout_h = 0
   include_tags = true
   locked = false
   require_full_window = true
-  new_host_delay = 300
-  notify_no_data = false
-  renotify_interval = 0
   no_data_timeframe = 20
 }

 resource "datadog_monitor" "datadog_free_disk_space_5" {
-  name = "Free disk space < 5%"
+  name = "[${var.env}] Free disk space < 5% on {{host.name}}"
   message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}"

-  query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 5"
+  query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,device,region,stack} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,device,region,stack} * 100 < 5"
   type = "query alert"
   count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"

-  notify_no_data = false
+  notify_no_data = "${var.linux_basics_config["notify_no_data"]}"
+  evaluation_delay = "${var.linux_basics_config["delay"]}"
+  new_host_delay = "${var.linux_basics_config["delay"]}"
   renotify_interval = 60
   notify_audit = false
   timeout_h = 0
   include_tags = true
   locked = false
   require_full_window = true
-  new_host_delay = 300
-  notify_no_data = false
-  renotify_interval = 0
   no_data_timeframe = 20
 }

 resource "datadog_monitor" "datadog_free_disk_space_10" {
-  name = "Free disk space < 10%"
+  name = "[${var.env}] Free disk space < 10% on {{host.name}}"
   message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.warning_escalation_group}\n{{/is_warning_recovery}}"

-  query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 10"
+  query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,device,region,stack} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,device,region,stack} * 100 < 10"
   type = "query alert"
   count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"

@@ -74,45 +71,43 @@ resource "datadog_monitor" "datadog_free_disk_space_10" {
     critical = 10
   }

-  notify_no_data = false
+  notify_no_data = "${var.linux_basics_config["notify_no_data"]}"
+  evaluation_delay = "${var.linux_basics_config["delay"]}"
+  new_host_delay = "${var.linux_basics_config["delay"]}"
   renotify_interval = 60
   notify_audit = false
   timeout_h = 0
   include_tags = true
   locked = false
   require_full_window = true
-  new_host_delay = 300
-  notify_no_data = false
-  renotify_interval = 0
   no_data_timeframe = 20
 }

 resource "datadog_monitor" "datadog_free_disk_space_inodes_5" {
-  name = "Free disk inodes < 5%"
+  name = "[${var.env}] Free disk inodes < 5% on {{host.name}}"
   message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group} \n{{/is_recovery}}"

-  query = "sum(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 5"
+  query = "sum(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,device,region,stack} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,device,region,stack} * 100 < 5"
   type = "query alert"
   count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"

-  notify_no_data = false
+  notify_no_data = "${var.linux_basics_config["notify_no_data"]}"
+  evaluation_delay = "${var.linux_basics_config["delay"]}"
+  new_host_delay = "${var.linux_basics_config["delay"]}"
   renotify_interval = 60
   notify_audit = false
   timeout_h = 0
   include_tags = true
   locked = false
   require_full_window = true
-  new_host_delay = 300
-  notify_no_data = false
-  renotify_interval = 0
   no_data_timeframe = 20
 }

 resource "datadog_monitor" "datadog_free_disk_space_inodes_10" {
-  name = "Free disk inodes < 10%"
+  name = "[${var.env}] Free disk inodes < 10% on {{host.name}}"
   message = "{{#is_alert}}\n${var.critical_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.warning_escalation_group}\n{{/is_warning_recovery}}"

-  query = "max(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 10"
+  query = "max(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,device,region,stack} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,device,region,stack} * 100 < 10"
   type = "query alert"
   count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"

@@ -121,16 +116,15 @@ resource "datadog_monitor" "datadog_free_disk_space_inodes_10" {
     critical = 10
   }

-  notify_no_data = false
+  notify_no_data = "${var.linux_basics_config["notify_no_data"]}"
+  evaluation_delay = "${var.linux_basics_config["delay"]}"
+  new_host_delay = "${var.linux_basics_config["delay"]}"
   renotify_interval = 60
   notify_audit = false
   timeout_h = 0
   include_tags = true
   locked = false
   require_full_window = true
-  new_host_delay = 300
-  notify_no_data = false
-  renotify_interval = 0
   no_data_timeframe = 20
 }

@@ -142,57 +136,56 @@ resource "datadog_monitor" "datadog_free_disk_space_inodes_10" {
 # type = "query alert"
 # count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"

-# notify_no_data = false
-# renotify_interval = 60
-# notify_audit = false
-# timeout_h = 0
-# include_tags = true
-# locked = false
-# require_full_window = true
-# new_host_delay = 300
-# notify_no_data = false
-# renotify_interval = 0
-# no_data_timeframe = 20
-#}
+
+  # notify_no_data = "${var.linux_basics_config["notify_no_data"]}"
+  # evaluation_delay = "${var.linux_basics_config["delay"]}"
+  # new_host_delay = "${var.linux_basics_config["delay"]}"
+  # renotify_interval = 60
+  # notify_audit = false
+  # timeout_h = 0
+  # include_tags = true
+  # locked = false
+  # require_full_window = true
+  # no_data_timeframe = 20
+# }
+

 resource "datadog_monitor" "datadog_free_memory" {
-  name = "Free memory < 5%"
+  name = "[${var.env}] Free memory < 5% on {{host.name}}"
   message = "Debugging alert - no escalation"

-  query = "sum(last_1m):avg:system.mem.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_memory:enabled} by {host} / avg:system.mem.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_memory:enabled} by {host} * 100 < 5"
+  query = "sum(last_1m):avg:system.mem.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_memory:enabled} by {host,region,stack} / avg:system.mem.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_memory:enabled} by {host,region,stack} * 100 < 5"
   type = "query alert"
   count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"

-  notify_no_data = false
+  notify_no_data = "${var.linux_basics_config["notify_no_data"]}"
+  evaluation_delay = "${var.linux_basics_config["delay"]}"
+  new_host_delay = "${var.linux_basics_config["delay"]}"
   renotify_interval = 60
   notify_audit = false
   timeout_h = 0
   include_tags = true
   locked = false
   require_full_window = true
-  new_host_delay = 300
-  notify_no_data = false
-  renotify_interval = 0
   no_data_timeframe = 20
 }

-resource "datadog_monitor" "datadog_host_unreachable" {
-  name = "Host unreachable"
-  message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group} \n{{/is_recovery}}"
+# resource "datadog_monitor" "datadog_host_unreachable" {
+# name = "Host unreachable"
+# message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group} \n{{/is_recovery}}"

-  query = "\"datadog.agent.up\".over(\"dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled\").last(1).count_by_status()"
-  type = "service check"
-  count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"
+# query = "datadog.agent.up.over(dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled).last(1).count_by_status()"
+# type = "service check"
+# count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"

-  notify_no_data = false
-  renotify_interval = 60
-  notify_audit = false
-  timeout_h = 0
-  include_tags = true
-  locked = false
-  require_full_window = true
-  new_host_delay = 300
-  notify_no_data = true
-  renotify_interval = 0
-  no_data_timeframe = 20
-}
+  # notify_no_data = "${var.linux_basics_config["notify_no_data"]}"
+  # evaluation_delay = "${var.linux_basics_config["delay"]}"
+  # new_host_delay = "${var.linux_basics_config["delay"]}"
+  # renotify_interval = 60
+  # notify_audit = false
+  # timeout_h = 0
+  # include_tags = true
+  # locked = false
+  # require_full_window = true
+  # no_data_timeframe = 20
+# }
diff --git a/monitors-rds_mysql-basics.tf b/monitors-rds_mysql-basics.tf
index 6d45170..90b15af 100644
--- a/monitors-rds_mysql-basics.tf
+++ b/monitors-rds_mysql-basics.tf
@@ -2,56 +2,55 @@


 resource "datadog_monitor" "rds-mysql_cpu_80_15min" {
-  name = "RDS Mysql CPU High > 90% for 15 min"
+  name = "[${var.env}] rds Cpu high > 90% for 15 min on {{identifier.name}}"
   message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_escalation_group}\n{{/is_warning_recovery}}"

-  count = "${var.dd_rds_mysql_basics == "enabled" ? 1 : 0 }"
+  count = "${var.dd_aws_rds == "enabled" ? 1 : 0 }"

-  query = "avg(last_15m):aws.rds.cpuutilization.total{\"dd_monitoring:enabled\",\"dd_rds-mysql_basics:enabled\",\"!dd_custom_rds-mysql:enabled\"} by {host} > 90"
+
+  query = "avg(last_15m):avg:aws.rds.cpuutilization{dd_monitoring:enabled,dd_aws_rds:enabled,env:${var.env},!dd_custom_rds-mysql:enabled} by {identifier,region} > ${var.rds_cpu_threshold["critical"]}"
   type = "query alert"

   thresholds {
-    warning = 0.8
-    critical = 0.9
+    warning = "${var.rds_cpu_threshold["warning"]}"
+    critical = "${var.rds_cpu_threshold["critical"]}"
   }

-  notify_no_data = false
+  notify_no_data = "${var.rds_config["notify_no_data"]}"
+  evaluation_delay = "${var.rds_config["delay"]}"
   renotify_interval = 60
   notify_audit = false
   timeout_h = 0
   include_tags = true
   locked = false
   require_full_window = true
-  new_host_delay = 300
-  notify_no_data = false
-  renotify_interval = 0
+  new_host_delay = "${var.rds_config["delay"]}"
   no_data_timeframe = 20
 }


 resource "datadog_monitor" "mysql_rds_free_space_low" {
-  name = "rds mysql free space low < 10 % on {{host.name}}"
+  name = "[${var.env}] rds free space low < 10 % on {{identifier.name}}"
   message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_escalation_group}\n{{/is_warning_recovery}}"

+  type = "query alert"
+  query = "avg(last_15m): avg:aws.rds.free_storage_space{dd_monitoring:enabled,dd_aws_rds:enabled,env:${var.env},!dd_custom_rds-mysql:enabled} by {identifier,region} / avg:aws.rds.total_storage_space{dd_monitoring:enabled,dd_aws_rds:enabled,env:${var.env},!dd_custom_rds-mysql:enabled} by {identifier,region} * 100 < ${var.rds_mem_threshold["critical"]}"
+  count = "${var.dd_aws_rds == "enabled" ? 1 : 0 }"

-  type = "service check"
-  query = "avg(last_10m): 1 - avg:aws.rds.free_storage_space{\"dd_monitoring:enabled\",\"dd_rds-mysql_basics:enabled\",\"!dd_custom_rds-mysql:enabled\"} by {host} / ( avg:aws.rds.total_storage_space{\"dd_monitoring:enabled\",\"dd_rds-mysql_basics:enabled\",\"!dd_custom_rds-mysql:enabled\"} by {host} - avg:aws.rds.free_storage_space{} by {host} ) < 0.1"
-  count = "${var.dd_rds_mysql_basics == "enabled" ? 1 : 0 }"

   thresholds {
-    warning = 0.2
-    critical = 0.1
+    warning = "${var.rds_mem_threshold["warning"]}"
+    critical = "${var.rds_mem_threshold["critical"]}"
   }

-  notify_no_data = false
+  notify_no_data = "${var.rds_config["notify_no_data"]}"
+  evaluation_delay = "${var.rds_config["delay"]}"
   renotify_interval = 60
   notify_audit = false
   timeout_h = 0
   include_tags = true
   locked = false
   require_full_window = true
-  new_host_delay = 300
-  renotify_interval = 0
+  new_host_delay = "${var.rds_config["delay"]}"
   no_data_timeframe = 20
-
 }
\ No newline at end of file
diff --git a/testing/inputs.tf b/testing/inputs.tf
index 55bf02e..1f174cf 100644
--- a/testing/inputs.tf
+++ b/testing/inputs.tf
@@ -1,3 +1,7 @@
+variable environment {}
+
+variable region {}
+
 variable "critical_escalation_group" {
   default = "@pagerduty_HODummy"
 }
@@ -5,22 +9,14 @@ variable "warning_escalation_group" {
   default = "@pagerduty_HNODummy"
 }

-variable "datadog_app_key" {}
 variable "datadog_api_key" {}
+variable "datadog_app_key" {}

 variable "dd_linux_basics" {
   default = "enabled"
 }

-variable "dd_custom_cpu" {
-  type = "map"
-  default = {
-    status = "enabled"
-    name = "CPU High > 95% during 1 hour"
-
-    period = "last_1h"
-
-    critical_threshold = 95
-    warning_threshold = 90
-  }
+variable "dd_aws_rds" {
+  default = "enabled"
 }
+
diff --git a/testing/modules.tf b/testing/modules.tf
index 036cb16..5574a18 100644
--- a/testing/modules.tf
+++ b/testing/modules.tf
@@ -1,13 +1,15 @@
 module "datadog-monitors" {
   source = "git::ssh://git@bitbucket.org/morea/terraform.datadog.monitors.git"

+  env = "${var.environment}"
+  region = "${var.region}"
+
   critical_escalation_group = "${var.critical_escalation_group}"
   warning_escalation_group = "${var.warning_escalation_group}"

-  #default monitors templates integrations examples
-  dd_linux_basics = "${var.dd_linux_basics}"
-  #nginx = "false"
-  #aws_rds_mysql = "false"
-  dd_custom_cpu = "${var.dd_custom_cpu}"
+  dd_aws_rds = "${var.dd_aws_rds}"
+  dd_linux_basics = "${var.dd_linux_basics}"
+

 }
+