diff --git a/datadog-samples/inputs-declaration.sample b/datadog-samples/inputs-declaration.sample index a82030e..01e8c8d 100644 --- a/datadog-samples/inputs-declaration.sample +++ b/datadog-samples/inputs-declaration.sample @@ -1,35 +1,47 @@ -variable environment {} - -variable region {} - -variable "critical_escalation_group" { - default = "@pagerduty_HODummy" -} -variable "warning_escalation_group" { - default = "@pagerduty_HNODummy" +# Global Terraform +variable "environment" { + description = "Architecture Environment" + type = "string" } -variable "datadog_api_key" {} -variable "datadog_app_key" {} - -variable "dd_linux_basics" { - default = "enabled" +# Global DataDog +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 } -variable "dd_aws_rds" { - default = "enabled" +variable "message" { + description = "Message sent when an alert is triggered" } - -variable "dd_custom_cpu" { - type = "map" - default = { - status = "enabled" - name = "CPU High > 95% during 1 hour" - - period = "last_1h" - - critical_threshold = 95 - warning_threshold = 90 - } +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# instance specific + +variable "cpu_threshold_warning" { + description = "CPU usage in percent (warning threshold)" + default = "80" +} + +variable "cpu_threshold_critical" { + description = "CPU usage in percent (critical threshold)" + default = "90" +} + +variable "diskspace_threshold_warning" { + description = "Disk free space in percent (warning threshold)" + default = "20" +} + +variable "diskspace_threshold_critical" { + description = "Disk free space in percent (critical threshold)" + default = "10" } diff --git a/system/generic/README.md b/system/generic/README.md new file mode 100644 index 
0000000..eb85357 --- /dev/null +++ b/system/generic/README.md @@ -0,0 +1,34 @@ +System Generic DataDog monitors +=============================== + +How to use this module +---------------------- + +``` +module "datadog-monitors-system-generic" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//system/generic?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates DataDog monitors with the following checks: + +* System CPU High + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| custom_cpu_period | Set up period for the query | string | `last_5m` | no | +| custom_cpu_threshold_critical | Custom CPU critical threshold | string | `95` | no | +| custom_cpu_threshold_warning | Custom CPU warning threshold | string | `80` | no | +| environment | Architecture Environment | string | - | yes | +| evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| message | Message sent when an alert is triggered | string | - | yes | diff --git a/system/generic/inputs.tf b/system/generic/inputs.tf deleted file mode 120000 index cdfc6c6..0000000 --- a/system/generic/inputs.tf +++ /dev/null @@ -1 +0,0 @@ -../../inputs.tf \ No newline at end of file diff --git a/system/generic/inputs.tf b/system/generic/inputs.tf new file mode 100644 index 0000000..06a118e --- /dev/null +++ b/system/generic/inputs.tf @@ -0,0 +1,42 @@ +# Global Terraform +variable "environment" { + description = "Architecture Environment" + type = "string" +} + +# Global DataDog +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} +
+variable "message" { + description = "Message sent when an alert is triggered" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# Custom CPU instance specific + +variable "custom_cpu_period" { + description = "Set up period for the query" + default = "last_5m" +} + +variable "custom_cpu_threshold_warning" { + description = "Custom CPU warning threshold" + default = 80 +} + +variable "custom_cpu_threshold_critical" { + description = "Custom CPU critical threshold" + default = 95 +} diff --git a/system/generic/monitors-custom-cpu.tf b/system/generic/monitors-custom-cpu.tf index ef2d695..57b9b03 100644 --- a/system/generic/monitors-custom-cpu.tf +++ b/system/generic/monitors-custom-cpu.tf @@ -1,19 +1,32 @@ -resource "datadog_monitor" "cpu_custom" { - name = "${var.dd_custom_cpu["name"]}" - message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" - count = "${var.dd_custom_cpu["status"] == "enabled" ? 1 : 0}" +data "template_file" "filter" { + template = "$${filter}" - query = "min(${var.dd_custom_cpu["period"]}):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu.monitoring:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > ${var.dd_custom_cpu["critical_threshold"]}" - type = "query alert" + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_aws_rds:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "cpu_custom" { + name = "[${var.environment}] CPU too High {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + message = "${var.message}" + + query = < ${var.custom_cpu_threshold_critical} + EOF + + type = "metric alert" thresholds = { - warning = "${var.dd_custom_cpu["warning_threshold"]}" - critical = "${var.dd_custom_cpu["critical_threshold"]}" + warning = "${var.custom_cpu_threshold_warning}" + critical = "${var.custom_cpu_threshold_critical}" } - notify_no_data = "${var.linux_basics_config["notify_no_data"]}" - evaluation_delay = "${var.linux_basics_config["delay"]}" - new_host_delay = "${var.linux_basics_config["delay"]}" + notify_no_data = true + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.evaluation_delay}" renotify_interval = 60 notify_audit = false timeout_h = 0 diff --git a/system/linux/README.md b/system/linux/README.md new file mode 100644 index 0000000..54aac37 --- /dev/null +++ b/system/linux/README.md @@ -0,0 +1,43 @@ +System Linux DataDog monitors +============================= + +How to use this module +---------------------- + +``` +module "datadog-monitors-system-linux" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//system/linux?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates DataDog monitors with the following checks: + +* System CPU High +* System Free disk space +* System Free disk inodes +* System Free memory + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| cpu_high_threshold_critical | CPU high critical threshold | string | `95` | no | +| cpu_high_threshold_warning | CPU high warning 
threshold | string | `80` | no | +| custom_cpu_period | Set up period for the query | string | `last_5m` | no | +| environment | Architecture Environment | string | - | yes | +| evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| free_disk_inodes_threshold_critical | Free disk inodes critical threshold | string | `5` | no | +| free_disk_inodes_threshold_warning | Free disk inodes warning threshold | string | `10` | no | +| free_disk_space_threshold_critical | Free disk space critical threshold | string | `5` | no | +| free_disk_space_threshold_warning | Free disk space warning threshold | string | `10` | no | +| free_memory_threshold_critical | Free memory critical threshold | string | `5` | no | +| free_memory_threshold_warning | Free memory warning threshold | string | `10` | no | +| message | Message sent when an alert is triggered | string | - | yes | \ No newline at end of file diff --git a/system/linux/inputs.tf b/system/linux/inputs.tf deleted file mode 120000 index cdfc6c6..0000000 --- a/system/linux/inputs.tf +++ /dev/null @@ -1 +0,0 @@ -../../inputs.tf \ No newline at end of file diff --git a/system/linux/inputs.tf b/system/linux/inputs.tf new file mode 100644 index 0000000..de67079 --- /dev/null +++ b/system/linux/inputs.tf @@ -0,0 +1,72 @@ +# Global Terraform +variable "environment" { + description = "Architecture Environment" + type = "string" +} + +# Global DataDog +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "message" { + description = "Message sent when an alert is triggered" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + 
description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# Custom CPU instance specific + +variable "custom_cpu_period" { + description = "Set up period for the query" + default = "last_5m" +} + +variable "cpu_high_threshold_warning" { + description = "CPU high warning threshold" + default = 80 +} + +variable "cpu_high_threshold_critical" { + description = "CPU high critical threshold" + default = 95 +} + +variable "free_disk_space_threshold_warning" { + description = "Free disk space warning threshold" + default = 10 +} + +variable "free_disk_space_threshold_critical" { + description = "Free disk space critical threshold" + default = 5 +} + +variable "free_disk_inodes_threshold_warning" { + description = "Free disk inodes warning threshold" + default = 10 +} + +variable "free_disk_inodes_threshold_critical" { + description = "Free disk inodes critical threshold" + default = 5 +} + +variable "free_memory_threshold_warning" { + description = "Free memory warning threshold" + default = 10 +} + +variable "free_memory_threshold_critical" { + description = "Free memory critical threshold" + default = 5 +} diff --git a/system/linux/monitors-linux-basics.tf b/system/linux/monitors-linux-basics.tf index 6260aed..6bd173d 100644 --- a/system/linux/monitors-linux-basics.tf +++ b/system/linux/monitors-linux-basics.tf @@ -1,21 +1,34 @@ -resource "datadog_monitor" "cpu_80_15min" { - name = "[${var.env}] CPU High > ${var.cpu_15_critical} for 15 min on {{host.name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" - count = "${var.dd_linux_basics == "enabled" ? 
1 : 0}" +data "template_file" "filter" { + template = "$${filter}" - query = "min(last_15m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region} > ${var.cpu_15_critical}" - type = "query alert" + vars { + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_aws_rds:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "datadog_cpu_too_high" { + name = "[${var.environment}] CPU High {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + message = "${var.message}" + + query = < ${var.cpu_high_threshold_critical} + EOF + + type = "metric alert" thresholds { - critical = "${var.cpu_15_critical}" + warning = "${var.cpu_high_threshold_warning}" + critical = "${var.cpu_high_threshold_critical}" } - tags = ["env:${var.env}", "type:system"] + tags = ["env:${var.environment}", "type:system"] - notify_no_data = "${var.linux_basics_config["notify_no_data"]}" - evaluation_delay = "${var.linux_basics_config["delay"]}" - new_host_delay = "${var.linux_basics_config["delay"]}" - renotify_interval = 60 + notify_no_data = true + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.evaluation_delay}" notify_audit = false timeout_h = 0 include_tags = true @@ -24,24 +37,29 @@ resource "datadog_monitor" "cpu_80_15min" { no_data_timeframe = 20 } -resource "datadog_monitor" "cpu_95_5min" { - name = "[${var.env}] CPU High > ${var.cpu_5_critical} for 5 min on {{host.name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" +resource "datadog_monitor" "datadog_free_disk_space_too_low" { + name = "[${var.environment}] Free disk space {{comparator}} 
{{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + message = "${var.message}" - query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region} > ${var.cpu_5_critical}" - type = "query alert" - count = "${var.dd_linux_basics == "enabled" ? 1 : 0}" + query = <