diff --git a/cloud/aws/elasticsearch/monitors-elasticsearch.tf b/cloud/aws/elasticsearch/monitors-elasticsearch.tf index 7ba3aa2..b665926 100644 --- a/cloud/aws/elasticsearch/monitors-elasticsearch.tf +++ b/cloud/aws/elasticsearch/monitors-elasticsearch.tf @@ -33,7 +33,7 @@ EOF timeout_h = 0 include_tags = true locked = false - require_full_window = true + require_full_window = false new_host_delay = "${var.evaluation_delay}" no_data_timeframe = 20 @@ -42,14 +42,15 @@ EOF ### Elasticsearch cluster free storage space monitor ### resource "datadog_monitor" "es_free_space_low" { - name = "[${var.environment}] ElasticSearch cluster free storage space < ${var.diskspace_threshold_critical}%" + name = "[${var.environment}] ElasticSearch cluster free storage space {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" message = "${var.message}" type = "query alert" query = < ${var.cpu_threshold_critical} +EOF + + thresholds { + warning = "${var.cpu_threshold_warning}" + critical = "${var.cpu_threshold_critical}" + } + + notify_no_data = true + evaluation_delay = "${var.evaluation_delay}" + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = false + new_host_delay = "${var.evaluation_delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:rds", "team:aws", "provider:aws"] +} + +### RDS instance free space monitor ### +resource "datadog_monitor" "rds_free_space_low" { + name = "[${var.environment}] RDS instance free space {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + message = "${var.message}" + + type = "metric alert" + + query = < ${var.php_fpm_busy_threshold_critical} + EOF thresholds { - warning = "${var.php_fpm_busy_threshold["warning"]}" - critical = "${var.php_fpm_busy_threshold["critical"]}" + warning = "${var.php_fpm_busy_threshold_warning}" + critical = 
"${var.php_fpm_busy_threshold_critical}" } - notify_no_data = "${var.apache_nginx_fpm_config["notify_no_data"]}" - evaluation_delay = "${var.apache_nginx_fpm_config["delay"]}" - new_host_delay = "${var.apache_nginx_fpm_config["delay"]}" - renotify_interval = 60 + notify_no_data = true + evaluation_delay = "${var.evaluation_delay_metric}" + new_host_delay = "${var.evaluation_delay_metric}" notify_audit = false timeout_h = 0 include_tags = true locked = false require_full_window = true - renotify_interval = 0 no_data_timeframe = 20 - tags = ["*"] + tags = ["env:${var.environment}", "resource:php-fpm"] } -resource "datadog_monitor" "FPM_process" { - name = "[${var.env}] FPM process is down on {{host.name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" +resource "datadog_monitor" "datadog_fpm_process" { + name = "[${var.environment}] Can't connect to php-fpm" + message = "${var.message}" type = "service check" - query = "\"process.up\".over(\"dd_monitoring:enabled\",\"dd_php_fpm:enabled\",\"process:php_fpm\",\"env:${var.env}\").by(\"host\",\"process\", \"app\").last(4).count_by_status()" - count = "${var.dd_nginx == "enabled" ? 
1 : 0 }" + query = "\"php_fpm.can_ping\".over(\"dd_monitoring:enabled\",\"dd_php_fpm:enabled\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" thresholds = { ok = 1 @@ -40,10 +51,10 @@ resource "datadog_monitor" "FPM_process" { critical = 4 } - notify_no_data = "${var.apache_nginx_fpm_config["notify_no_data"]}" - evaluation_delay = "${var.apache_nginx_fpm_config["delay"]}" - new_host_delay = "${var.apache_nginx_fpm_config["delay"]}" - renotify_interval = 60 + notify_no_data = true + evaluation_delay = "${var.evaluation_delay_service}" + new_host_delay = "${var.evaluation_delay_service}" + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -51,5 +62,5 @@ resource "datadog_monitor" "FPM_process" { require_full_window = true no_data_timeframe = 20 - tags = ["*"] + tags = ["env:${var.environment}", "resource:php-fpm"] } diff --git a/system/generic/README.md b/system/generic/README.md new file mode 100644 index 0000000..abf80e0 --- /dev/null +++ b/system/generic/README.md @@ -0,0 +1,40 @@ +System Generic DataDog monitors +=============================== + +How to use this module +---------------------- + +``` +module "datadog-monitors-system-generic" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//system/generic?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates DataDog monitors with the following checks: + +* System CPU High + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| cpu_high_threshold_critical | CPU high critical threshold | string | `95` | no | +| cpu_high_threshold_warning | CPU high warning threshold | string | `80` | no | +| cpu_high_timeframe | CPU high timeframe | string | `last_5m` | no | +| environment | Architecture Environment | string | - | yes | +| evaluation_delay | Delay in 
seconds for the metric evaluation | string | `600` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| free_disk_inodes_threshold_critical | Free disk inodes critical threshold | string | `5` | no | +| free_disk_inodes_threshold_warning | Free disk inodes warning threshold | string | `10` | no | +| free_disk_space_threshold_critical | Free disk space critical threshold | string | `5` | no | +| free_disk_space_threshold_warning | Free disk space warning threshold | string | `10` | no | +| free_memory_threshold_critical | Free memory critical threshold | string | `5` | no | +| free_memory_threshold_warning | Free memory warning threshold | string | `10` | no | +| message | Message sent when an alert is triggered | string | - | yes | diff --git a/system/generic/inputs.tf b/system/generic/inputs.tf deleted file mode 120000 index cdfc6c6..0000000 --- a/system/generic/inputs.tf +++ /dev/null @@ -1 +0,0 @@ -../../inputs.tf \ No newline at end of file diff --git a/system/generic/inputs.tf b/system/generic/inputs.tf new file mode 100644 index 0000000..5b34296 --- /dev/null +++ b/system/generic/inputs.tf @@ -0,0 +1,72 @@ +# Global Terraform +variable "environment" { + description = "Architecture Environment" + type = "string" +} + +# Global DataDog +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "message" { + description = "Message sent when an alert is triggered" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# Custom CPU instance specific + +variable "cpu_high_timeframe" { + description = "CPU high timeframe" + default 
= "last_5m" +} + +variable "cpu_high_threshold_warning" { + description = "CPU high warning threshold" + default = 80 +} + +variable "cpu_high_threshold_critical" { + description = "CPU high critical threshold" + default = 95 +} + +variable "free_disk_space_threshold_warning" { + description = "Free disk space warning threshold" + default = 10 +} + +variable "free_disk_space_threshold_critical" { + description = "Free disk space critical threshold" + default = 5 +} + +variable "free_disk_inodes_threshold_warning" { + description = "Free disk inodes warning threshold" + default = 10 +} + +variable "free_disk_inodes_threshold_critical" { + description = "Free disk inodes critical threshold" + default = 5 +} + +variable "free_memory_threshold_warning" { + description = "Free memory warning threshold" + default = 10 +} + +variable "free_memory_threshold_critical" { + description = "Free memory critical threshold" + default = 5 +} diff --git a/system/generic/monitors-custom-cpu.tf b/system/generic/monitors-custom-cpu.tf deleted file mode 100644 index ef2d695..0000000 --- a/system/generic/monitors-custom-cpu.tf +++ /dev/null @@ -1,24 +0,0 @@ -resource "datadog_monitor" "cpu_custom" { - name = "${var.dd_custom_cpu["name"]}" - message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" - count = "${var.dd_custom_cpu["status"] == "enabled" ? 
1 : 0}" - - query = "min(${var.dd_custom_cpu["period"]}):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu.monitoring:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > ${var.dd_custom_cpu["critical_threshold"]}" - type = "query alert" - - thresholds = { - warning = "${var.dd_custom_cpu["warning_threshold"]}" - critical = "${var.dd_custom_cpu["critical_threshold"]}" - } - - notify_no_data = "${var.linux_basics_config["notify_no_data"]}" - evaluation_delay = "${var.linux_basics_config["delay"]}" - new_host_delay = "${var.linux_basics_config["delay"]}" - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - no_data_timeframe = 20 -} diff --git a/system/generic/monitors-system.tf b/system/generic/monitors-system.tf new file mode 100644 index 0000000..0a09ffd --- /dev/null +++ b/system/generic/monitors-system.tf @@ -0,0 +1,131 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_system:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "datadog_cpu_too_high" { + name = "[${var.environment}] CPU usage {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + message = "${var.message}" + + query = < ${var.cpu_high_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.cpu_high_threshold_warning}" + critical = "${var.cpu_high_threshold_critical}" + } + + tags = ["env:${var.environment}", "type:system", "resource:cpu"] + + notify_no_data = true + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.evaluation_delay}" + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + no_data_timeframe = 20 +} + +resource "datadog_monitor" "datadog_free_disk_space_too_low" { + name = "[${var.environment}] Free disk space {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + message = "${var.message}" + + query = <