From 92833feff788d16b3831623690d676b88448e006 Mon Sep 17 00:00:00 2001 From: Ahmed Fourti Date: Wed, 13 Dec 2017 12:11:15 +0100 Subject: [PATCH 01/11] MON-96 Update non-cloud resource tags --- middleware/apache/monitors-apache.tf | 2 +- middleware/nginx/monitors-nginx.tf | 2 +- middleware/php-fpm/monitors-fpm.tf | 4 ++-- system/linux/monitors-linux-basics.tf | 14 +++++++------- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/middleware/apache/monitors-apache.tf b/middleware/apache/monitors-apache.tf index 49b31d4..b542aa4 100644 --- a/middleware/apache/monitors-apache.tf +++ b/middleware/apache/monitors-apache.tf @@ -23,5 +23,5 @@ resource "datadog_monitor" "Apache_process" { require_full_window = true no_data_timeframe = 20 - tags = ["*"] + tags = ["env:${var.env}", "type:apache"] } diff --git a/middleware/nginx/monitors-nginx.tf b/middleware/nginx/monitors-nginx.tf index e09c010..03595bf 100644 --- a/middleware/nginx/monitors-nginx.tf +++ b/middleware/nginx/monitors-nginx.tf @@ -23,5 +23,5 @@ resource "datadog_monitor" "Nginx_process" { require_full_window = true no_data_timeframe = 20 - tags = ["*"] + tags = ["env:${var.env}", "type:nginx"] } diff --git a/middleware/php-fpm/monitors-fpm.tf b/middleware/php-fpm/monitors-fpm.tf index 97e7ff8..8bf654e 100644 --- a/middleware/php-fpm/monitors-fpm.tf +++ b/middleware/php-fpm/monitors-fpm.tf @@ -23,7 +23,7 @@ resource "datadog_monitor" "php-fpm_process_idle" { renotify_interval = 0 no_data_timeframe = 20 - tags = ["*"] + tags = ["env:${var.env}", "type:php-fpm"] } resource "datadog_monitor" "FPM_process" { @@ -51,5 +51,5 @@ resource "datadog_monitor" "FPM_process" { require_full_window = true no_data_timeframe = 20 - tags = ["*"] + tags = ["env:${var.env}", "type:php-fpm"] } diff --git a/system/linux/monitors-linux-basics.tf b/system/linux/monitors-linux-basics.tf index 459122d..6260aed 100644 --- a/system/linux/monitors-linux-basics.tf +++ b/system/linux/monitors-linux-basics.tf @@ -10,7 +10,7 @@ 
resource "datadog_monitor" "cpu_80_15min" { critical = "${var.cpu_15_critical}" } - tags = ["*"] + tags = ["env:${var.env}", "type:system"] notify_no_data = "${var.linux_basics_config["notify_no_data"]}" evaluation_delay = "${var.linux_basics_config["delay"]}" @@ -36,7 +36,7 @@ resource "datadog_monitor" "cpu_95_5min" { critical = "${var.cpu_5_critical}" } - tags = ["*"] + tags = ["env:${var.env}", "type:system"] notify_no_data = "${var.linux_basics_config["notify_no_data"]}" evaluation_delay = "${var.linux_basics_config["delay"]}" @@ -62,7 +62,7 @@ resource "datadog_monitor" "datadog_free_disk_space_5" { critical = 5 } - tags = ["*"] + tags = ["env:${var.env}", "type:system"] notify_no_data = "${var.linux_basics_config["notify_no_data"]}" evaluation_delay = "${var.linux_basics_config["delay"]}" @@ -89,7 +89,7 @@ resource "datadog_monitor" "datadog_free_disk_space_10" { critical = 10 } - tags = ["*"] + tags = ["env:${var.env}", "type:system"] notify_no_data = "${var.linux_basics_config["notify_no_data"]}" evaluation_delay = "${var.linux_basics_config["delay"]}" @@ -115,7 +115,7 @@ resource "datadog_monitor" "datadog_free_disk_space_inodes_5" { critical = 5 } - tags = ["*"] + tags = ["env:${var.env}", "type:system"] notify_no_data = "${var.linux_basics_config["notify_no_data"]}" evaluation_delay = "${var.linux_basics_config["delay"]}" @@ -142,7 +142,7 @@ resource "datadog_monitor" "datadog_free_disk_space_inodes_10" { critical = 10 } - tags = ["*"] + tags = ["env:${var.env}", "type:system"] notify_no_data = "${var.linux_basics_config["notify_no_data"]}" evaluation_delay = "${var.linux_basics_config["delay"]}" @@ -188,7 +188,7 @@ resource "datadog_monitor" "datadog_free_memory" { critical = 5 } - tags = ["*"] + tags = ["env:${var.env}", "type:system"] notify_no_data = "${var.linux_basics_config["notify_no_data"]}" evaluation_delay = "${var.linux_basics_config["delay"]}" From 86b999d7f328296f82500c0ae07f96e6570c0c9d Mon Sep 17 00:00:00 2001 From: Guillaume Kerivel 
Date: Tue, 16 Jan 2018 16:08:45 +0100 Subject: [PATCH 02/11] MON-96 Update basics RDS monitors --- cloud/aws/rds-mysql/inputs.tf | 1 - .../rds-mysql/monitors-rds_mysql-basics.tf | 54 --------------- cloud/aws/rds/README.md | 45 ++++++++++++ cloud/aws/rds/inputs.tf | 57 +++++++++++++++ cloud/aws/rds/monitors-rds-basics.tf | 69 +++++++++++++++++++ 5 files changed, 171 insertions(+), 55 deletions(-) delete mode 120000 cloud/aws/rds-mysql/inputs.tf delete mode 100644 cloud/aws/rds-mysql/monitors-rds_mysql-basics.tf create mode 100644 cloud/aws/rds/README.md create mode 100644 cloud/aws/rds/inputs.tf create mode 100644 cloud/aws/rds/monitors-rds-basics.tf diff --git a/cloud/aws/rds-mysql/inputs.tf b/cloud/aws/rds-mysql/inputs.tf deleted file mode 120000 index a68ace3..0000000 --- a/cloud/aws/rds-mysql/inputs.tf +++ /dev/null @@ -1 +0,0 @@ -../../../inputs.tf \ No newline at end of file diff --git a/cloud/aws/rds-mysql/monitors-rds_mysql-basics.tf b/cloud/aws/rds-mysql/monitors-rds_mysql-basics.tf deleted file mode 100644 index 87e0746..0000000 --- a/cloud/aws/rds-mysql/monitors-rds_mysql-basics.tf +++ /dev/null @@ -1,54 +0,0 @@ -resource "datadog_monitor" "rds-mysql_cpu_80_15min" { - name = "[${var.env}] rds Cpu high > 90% for 15 min on {{host.identifier}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" - - count = "${var.dd_aws_rds == "enabled" ? 
1 : 0 }" - - query = "avg(last_15m):avg:aws.rds.cpuutilization{dd_monitoring:enabled,dd_aws_rds:enabled,env:${var.env}} by {region,name} > 90" - type = "query alert" - - thresholds { - warning = "${var.rds_cpu_threshold["warning"]}" - critical = "${var.rds_cpu_threshold["critical"]}" - } - - tags = ["*"] - - notify_no_data = "${var.rds_config["notify_no_data"]}" - evaluation_delay = "${var.rds_config["delay"]}" - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = "${var.rds_config["delay"]}" - no_data_timeframe = 20 -} - -resource "datadog_monitor" "mysql_rds_free_space_low" { - name = "[${var.env}] rds free space low < 10 % on {{host.identifier}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" - - type = "query alert" - query = "avg(last_15m):avg:aws.rds.free_storage_space{dd_monitoring:enabled,dd_aws_rds:enabled,env:${var.env}} by {region,name} / avg:aws.rds.total_storage_space{dd_monitoring:enabled,dd_aws_rds:enabled,env:${var.env}} by {region,name} * 100 < 10" - count = "${var.dd_aws_rds == "enabled" ? 
1 : 0 }" - - thresholds { - warning = "${var.rds_mem_threshold["warning"]}" - critical = "${var.rds_mem_threshold["critical"]}" - } - - tags = ["*"] - - notify_no_data = "${var.rds_config["notify_no_data"]}" - evaluation_delay = "${var.rds_config["delay"]}" - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = "${var.rds_config["delay"]}" - no_data_timeframe = 20 -} diff --git a/cloud/aws/rds/README.md b/cloud/aws/rds/README.md new file mode 100644 index 0000000..36e8780 --- /dev/null +++ b/cloud/aws/rds/README.md @@ -0,0 +1,45 @@ +AWS RDS Instance DataDog monitors +================================= + +How to use this module +---------------------- + +``` +module "datadog-monitors-aws-rds" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/aws/rds?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* CPU High +* Free disk space low + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | +| cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no | +| diskspace_threshold_critical | Disk free space in percent (critical threshold) | string | `10` | no | +| diskspace_threshold_warning | Disk free space in percent (warning threshold) | string | `20` | no | +| environment | Architecture Environment | string | - | yes | +| evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no 
| +| message | Message sent when an alert is triggered | string | - | yes | +| notify_no_data | Enable 'No Data' alert | string | `true` | no | +| renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status | string | `60` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/amazon_rds/](https://docs.datadoghq.com/integrations/amazon_rds/) + +AWS RDS Instance metrics documentation: [https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/rds-metricscollected.html](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/rds-metricscollected.html) diff --git a/cloud/aws/rds/inputs.tf b/cloud/aws/rds/inputs.tf new file mode 100644 index 0000000..8e6d89a --- /dev/null +++ b/cloud/aws/rds/inputs.tf @@ -0,0 +1,57 @@ +# Global Terraform +variable "environment" { + description = "Architecture Environment" + type = "string" +} + +# Global DataDog +variable "notify_no_data" { + description = "Enable 'No Data' alert" + default = true +} + +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "renotify_interval" { + description = "The number of minutes after the last notification before a monitor will re-notify on the current status" + default = 60 +} + +variable "message" { + description = "Message sent when an alert is triggered" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# AWS RDS instance specific + +variable "cpu_threshold_warning" { + description = "CPU usage in percent (warning threshold)" + default = "80" +} + +variable "cpu_threshold_critical" { + description = "CPU usage in percent (critical threshold)" + default = "90" +} + +variable 
"diskspace_threshold_warning" { + description = "Disk free space in percent (warning threshold)" + default = "20" +} + +variable "diskspace_threshold_critical" { + description = "Disk free space in percent (critical threshold)" + default = "10" +} diff --git a/cloud/aws/rds/monitors-rds-basics.tf b/cloud/aws/rds/monitors-rds-basics.tf new file mode 100644 index 0000000..bb83b46 --- /dev/null +++ b/cloud/aws/rds/monitors-rds-basics.tf @@ -0,0 +1,69 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_aws_rds:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +### RDS instance CPU monitor ### +resource "datadog_monitor" "rds_cpu_90_15min" { + name = "[${var.environment}] RDS instance CPU high > ${var.cpu_threshold_critical}% for 15 min on {{host.identifier}}" + message = "${var.message}" + + type = "metric alert" + query = < ${var.cpu_threshold_critical} +EOF + + thresholds { + warning = "${var.cpu_threshold_warning}" + critical = "${var.cpu_threshold_critical}" + } + + notify_no_data = "${var.notify_no_data}" + evaluation_delay = "${var.evaluation_delay}" + renotify_interval = "${var.renotify_interval}" + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.evaluation_delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:rds", "team:aws", "provider:aws"] +} + +### RDS instance free space monitor ### +resource "datadog_monitor" "rds_free_space_low" { + name = "[${var.environment}] RDS instance free space < ${var.diskspace_threshold_critical}% on {{host.identifier}}" + message = "${var.message}" + + type = "metric alert" + query = < Date: Wed, 21 Feb 2018 16:10:51 +0100 Subject: [PATCH 03/11] MON-96 - Rename file and terraform fmt --- cloud/aws/rds/{monitors-rds-basics.tf => monitors-rds.tf} | 0 1 file changed, 0 insertions(+), 0 
deletions(-) rename cloud/aws/rds/{monitors-rds-basics.tf => monitors-rds.tf} (100%) diff --git a/cloud/aws/rds/monitors-rds-basics.tf b/cloud/aws/rds/monitors-rds.tf similarity index 100% rename from cloud/aws/rds/monitors-rds-basics.tf rename to cloud/aws/rds/monitors-rds.tf From 8f9bf61d19d36c76c82617f5fa68c6ca5935e272 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Wed, 21 Feb 2018 16:18:07 +0100 Subject: [PATCH 04/11] MON-96 - Apply best practice and recommandations --- cloud/aws/rds/README.md | 2 -- cloud/aws/rds/inputs.tf | 10 ---------- cloud/aws/rds/monitors-rds.tf | 19 ++++++++++--------- 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/cloud/aws/rds/README.md b/cloud/aws/rds/README.md index 36e8780..b6337a1 100644 --- a/cloud/aws/rds/README.md +++ b/cloud/aws/rds/README.md @@ -34,8 +34,6 @@ Inputs | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when an alert is triggered | string | - | yes | -| notify_no_data | Enable 'No Data' alert | string | `true` | no | -| renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status | string | `60` | no | Related documentation --------------------- diff --git a/cloud/aws/rds/inputs.tf b/cloud/aws/rds/inputs.tf index 8e6d89a..cac076f 100644 --- a/cloud/aws/rds/inputs.tf +++ b/cloud/aws/rds/inputs.tf @@ -5,21 +5,11 @@ variable "environment" { } # Global DataDog -variable "notify_no_data" { - description = "Enable 'No Data' alert" - default = true -} - variable "evaluation_delay" { description = "Delay in seconds for the metric evaluation" default = 600 } -variable "renotify_interval" { - description = "The number of minutes after the last notification before a monitor will re-notify on the current status" - default = 60 -} - variable "message" { description 
= "Message sent when an alert is triggered" } diff --git a/cloud/aws/rds/monitors-rds.tf b/cloud/aws/rds/monitors-rds.tf index bb83b46..dffb178 100644 --- a/cloud/aws/rds/monitors-rds.tf +++ b/cloud/aws/rds/monitors-rds.tf @@ -8,10 +8,11 @@ data "template_file" "filter" { ### RDS instance CPU monitor ### resource "datadog_monitor" "rds_cpu_90_15min" { - name = "[${var.environment}] RDS instance CPU high > ${var.cpu_threshold_critical}% for 15 min on {{host.identifier}}" + name = "[${var.environment}] RDS instance CPU high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" message = "${var.message}" - type = "metric alert" + type = "metric alert" + query = < Date: Wed, 21 Feb 2018 16:55:06 +0100 Subject: [PATCH 05/11] MON-96 - Apply conventions and update middleware directory --- .../elasticsearch/monitors-elasticsearch.tf | 7 ++- middleware/apache/README.md | 30 ++++++++++ middleware/apache/inputs.tf | 22 +++++++- middleware/apache/monitors-apache.tf | 15 +++-- middleware/nginx/README.md | 30 ++++++++++ middleware/nginx/inputs.tf | 22 +++++++- middleware/nginx/monitors-nginx.tf | 15 +++-- middleware/php-fpm/README.md | 35 ++++++++++++ middleware/php-fpm/inputs.tf | 43 ++++++++++++++- middleware/php-fpm/monitors-fpm.tf | 55 +++++++++++-------- 10 files changed, 230 insertions(+), 44 deletions(-) create mode 100644 middleware/apache/README.md mode change 120000 => 100644 middleware/apache/inputs.tf create mode 100644 middleware/nginx/README.md mode change 120000 => 100644 middleware/nginx/inputs.tf create mode 100644 middleware/php-fpm/README.md mode change 120000 => 100644 middleware/php-fpm/inputs.tf diff --git a/cloud/aws/elasticsearch/monitors-elasticsearch.tf b/cloud/aws/elasticsearch/monitors-elasticsearch.tf index 7ba3aa2..8880a02 100644 --- a/cloud/aws/elasticsearch/monitors-elasticsearch.tf +++ b/cloud/aws/elasticsearch/monitors-elasticsearch.tf @@ -42,14 +42,15 @@ EOF ### Elasticsearch 
cluster free storage space monitor ### resource "datadog_monitor" "es_free_space_low" { - name = "[${var.environment}] ElasticSearch cluster free storage space < ${var.diskspace_threshold_critical}%" + name = "[${var.environment}] ElasticSearch cluster free storage space {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" message = "${var.message}" type = "query alert" query = < ${var.php_fpm_busy_threshold_critical} + EOF thresholds { - warning = "${var.php_fpm_busy_threshold["warning"]}" - critical = "${var.php_fpm_busy_threshold["critical"]}" + warning = "${var.php_fpm_busy_threshold_warning}" + critical = "${var.php_fpm_busy_threshold_critical}" } - notify_no_data = "${var.apache_nginx_fpm_config["notify_no_data"]}" - evaluation_delay = "${var.apache_nginx_fpm_config["delay"]}" - new_host_delay = "${var.apache_nginx_fpm_config["delay"]}" - renotify_interval = 60 + notify_no_data = false + evaluation_delay = "${var.evaluation_delay_metric}" + new_host_delay = "${var.evaluation_delay_metric}" notify_audit = false timeout_h = 0 include_tags = true locked = false require_full_window = true - renotify_interval = 0 no_data_timeframe = 20 - tags = ["env:${var.env}", "type:php-fpm"] + tags = ["env:${var.environment}", "type:php-fpm"] } resource "datadog_monitor" "FPM_process" { - name = "[${var.env}] FPM process is down on {{host.name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" + name = "[${var.environment}] FPM process is down on {{host.name}}" + message = "${var.message}" type = "service check" - query = "\"process.up\".over(\"dd_monitoring:enabled\",\"dd_php_fpm:enabled\",\"process:php_fpm\",\"env:${var.env}\").by(\"host\",\"process\", 
\"app\").last(4).count_by_status()" - count = "${var.dd_nginx == "enabled" ? 1 : 0 }" + query = "\"process.up\".over(\"dd_monitoring:enabled\",\"dd_php_fpm:enabled\",\"process:php_fpm\",\"env:${var.environment}\").by(\"host\",\"process\", \"app\").last(4).count_by_status()" thresholds = { ok = 1 @@ -40,9 +51,9 @@ resource "datadog_monitor" "FPM_process" { critical = 4 } - notify_no_data = "${var.apache_nginx_fpm_config["notify_no_data"]}" - evaluation_delay = "${var.apache_nginx_fpm_config["delay"]}" - new_host_delay = "${var.apache_nginx_fpm_config["delay"]}" + notify_no_data = true + evaluation_delay = "${var.evaluation_delay_service}" + new_host_delay = "${var.evaluation_delay_service}" renotify_interval = 60 notify_audit = false timeout_h = 0 @@ -51,5 +62,5 @@ resource "datadog_monitor" "FPM_process" { require_full_window = true no_data_timeframe = 20 - tags = ["env:${var.env}", "type:php-fpm"] + tags = ["env:${var.environment}", "type:php-fpm"] } From dd729b90e5dbd3d28b4dd56f696e84dbc3077836 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Wed, 21 Feb 2018 17:34:17 +0100 Subject: [PATCH 06/11] MON-96 - Update system directory and update inputs-declaration sample --- datadog-samples/inputs-declaration.sample | 68 +++++--- system/generic/README.md | 34 ++++ system/generic/inputs.tf | 43 ++++- system/generic/monitors-custom-cpu.tf | 35 ++-- system/linux/README.md | 43 +++++ system/linux/inputs.tf | 73 +++++++- system/linux/monitors-linux-basics.tf | 204 +++++++++------------- 7 files changed, 333 insertions(+), 167 deletions(-) create mode 100644 system/generic/README.md mode change 120000 => 100644 system/generic/inputs.tf create mode 100644 system/linux/README.md mode change 120000 => 100644 system/linux/inputs.tf diff --git a/datadog-samples/inputs-declaration.sample b/datadog-samples/inputs-declaration.sample index a82030e..01e8c8d 100644 --- a/datadog-samples/inputs-declaration.sample +++ b/datadog-samples/inputs-declaration.sample @@ -1,35 +1,47 @@ 
-variable environment {} - -variable region {} - -variable "critical_escalation_group" { - default = "@pagerduty_HODummy" -} -variable "warning_escalation_group" { - default = "@pagerduty_HNODummy" +# Global Terraform +variable "environment" { + description = "Architecture Environment" + type = "string" } -variable "datadog_api_key" {} -variable "datadog_app_key" {} - -variable "dd_linux_basics" { - default = "enabled" +# Global DataDog +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 } -variable "dd_aws_rds" { - default = "enabled" +variable "message" { + description = "Message sent when an alert is triggered" } - -variable "dd_custom_cpu" { - type = "map" - default = { - status = "enabled" - name = "CPU High > 95% during 1 hour" - - period = "last_1h" - - critical_threshold = 95 - warning_threshold = 90 - } +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# instance specific + +variable "cpu_threshold_warning" { + description = "CPU usage in percent (warning threshold)" + default = "80" +} + +variable "cpu_threshold_critical" { + description = "CPU usage in percent (critical threshold)" + default = "90" +} + +variable "diskspace_threshold_warning" { + description = "Disk free space in percent (warning threshold)" + default = "20" +} + +variable "diskspace_threshold_critical" { + description = "Disk free space in percent (critical threshold)" + default = "10" } diff --git a/system/generic/README.md b/system/generic/README.md new file mode 100644 index 0000000..eb85357 --- /dev/null +++ b/system/generic/README.md @@ -0,0 +1,34 @@ +System Generic DataDog monitors +=============================== + +How to use this module +---------------------- + +``` +module "datadog-monitors-system-generic" { + 
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//system/generic?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* System CPU High + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| custom_cpu_period | Set up period for the query | string | `last_5m` | no | +| custom_cpu_threshold_critical | Custom CPU critical threshold | string | `95` | no | +| custom_cpu_threshold_warning | Custom CPU warning threshold | string | `80` | no | +| environment | Architecture Environment | string | - | yes | +| evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| message | Message sent when an alert is triggered | string | - | yes | diff --git a/system/generic/inputs.tf b/system/generic/inputs.tf deleted file mode 120000 index cdfc6c6..0000000 --- a/system/generic/inputs.tf +++ /dev/null @@ -1 +0,0 @@ -../../inputs.tf \ No newline at end of file diff --git a/system/generic/inputs.tf b/system/generic/inputs.tf new file mode 100644 index 0000000..06a118e --- /dev/null +++ b/system/generic/inputs.tf @@ -0,0 +1,42 @@ +# Global Terraform +variable "environment" { + description = "Architecture Environment" + type = "string" +} + +# Global DataDog +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "message" { + description = "Message sent when an alert is triggered" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description 
= "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# Custom CPU instance specific + +variable "custom_cpu_period" { + description = "Set up period for the query" + default = "last_5m" +} + +variable "custom_cpu_threshold_warning" { + description = "Custom CPU warning threshold" + default = 80 +} + +variable "custom_cpu_threshold_critical" { + description = "Custom CPU critical threshold" + default = 95 +} diff --git a/system/generic/monitors-custom-cpu.tf b/system/generic/monitors-custom-cpu.tf index ef2d695..57b9b03 100644 --- a/system/generic/monitors-custom-cpu.tf +++ b/system/generic/monitors-custom-cpu.tf @@ -1,19 +1,32 @@ -resource "datadog_monitor" "cpu_custom" { - name = "${var.dd_custom_cpu["name"]}" - message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" - count = "${var.dd_custom_cpu["status"] == "enabled" ? 1 : 0}" +data "template_file" "filter" { + template = "$${filter}" - query = "min(${var.dd_custom_cpu["period"]}):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu.monitoring:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > ${var.dd_custom_cpu["critical_threshold"]}" - type = "query alert" + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_aws_rds:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "cpu_custom" { + name = "[${var.environment}] CPU too High {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + message = "${var.message}" + + query = < ${var.custom_cpu_threshold_critical}" + EOF + + type = "metric alert" thresholds = { - warning = "${var.dd_custom_cpu["warning_threshold"]}" - critical = "${var.dd_custom_cpu["critical_threshold"]}" + warning = "${var.custom_cpu_threshold_warning}" + critical = "${var.custom_cpu_threshold_critical}" } - notify_no_data = "${var.linux_basics_config["notify_no_data"]}" - evaluation_delay = "${var.linux_basics_config["delay"]}" - new_host_delay = "${var.linux_basics_config["delay"]}" + notify_no_data = true + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.evaluation_delay}" renotify_interval = 60 notify_audit = false timeout_h = 0 diff --git a/system/linux/README.md b/system/linux/README.md new file mode 100644 index 0000000..54aac37 --- /dev/null +++ b/system/linux/README.md @@ -0,0 +1,43 @@ +System Linux DataDog monitors +============================= + +How to use this module +---------------------- + +``` +module "datadog-monitors-system-generic" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//system/linux?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* System CPU High +* System Free disk space +* System Free disk inodes +* System Free memory + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| cpu_high_threshold_critical | CPU high critical threshold | string | `95` | no | +| cpu_high_threshold_warning | CPU high warning 
threshold | string | `80` | no | +| custom_cpu_period | Set up period for the query | string | `last_5m` | no | +| environment | Architecture Environment | string | - | yes | +| evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| free_disk_inodes_threshold_critical | Free disk inodes critical threshold | string | `5` | no | +| free_disk_inodes_threshold_warning | Free disk inodes warning threshold | string | `10` | no | +| free_disk_space_threshold_critical | Free disk space critical threshold | string | `5` | no | +| free_disk_space_threshold_warning | Free disk space warning threshold | string | `10` | no | +| free_memory_threshold_critical | Free memory critical threshold | string | `5` | no | +| free_memory_threshold_warning | Free memory warning threshold | string | `10` | no | +| message | Message sent when an alert is triggered | string | - | yes | \ No newline at end of file diff --git a/system/linux/inputs.tf b/system/linux/inputs.tf deleted file mode 120000 index cdfc6c6..0000000 --- a/system/linux/inputs.tf +++ /dev/null @@ -1 +0,0 @@ -../../inputs.tf \ No newline at end of file diff --git a/system/linux/inputs.tf b/system/linux/inputs.tf new file mode 100644 index 0000000..de67079 --- /dev/null +++ b/system/linux/inputs.tf @@ -0,0 +1,72 @@ +# Global Terraform +variable "environment" { + description = "Architecture Environment" + type = "string" +} + +# Global DataDog +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "message" { + description = "Message sent when an alert is triggered" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + 
description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# Custom CPU instance specific + +variable "custom_cpu_period" { + description = "Set up period for the query" + default = "last_5m" +} + +variable "cpu_high_threshold_warning" { + description = "CPU high warning threshold" + default = 80 +} + +variable "cpu_high_threshold_critical" { + description = "CPU high critical threshold" + default = 95 +} + +variable "free_disk_space_threshold_warning" { + description = "Free disk space warning threshold" + default = 10 +} + +variable "free_disk_space_threshold_critical" { + description = "Free disk space critical threshold" + default = 5 +} + +variable "free_disk_inodes_threshold_warning" { + description = "Free disk space warning threshold" + default = 10 +} + +variable "free_disk_inodes_threshold_critical" { + description = "Free disk space critical threshold" + default = 5 +} + +variable "free_memory_threshold_warning" { + description = "Free disk space warning threshold" + default = 10 +} + +variable "free_memory_threshold_critical" { + description = "Free disk space critical threshold" + default = 5 +} diff --git a/system/linux/monitors-linux-basics.tf b/system/linux/monitors-linux-basics.tf index 6260aed..6bd173d 100644 --- a/system/linux/monitors-linux-basics.tf +++ b/system/linux/monitors-linux-basics.tf @@ -1,21 +1,34 @@ -resource "datadog_monitor" "cpu_80_15min" { - name = "[${var.env}] CPU High > ${var.cpu_15_critical} for 15 min on {{host.name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" - count = "${var.dd_linux_basics == "enabled" ? 
1 : 0}" +data "template_file" "filter" { + template = "$${filter}" - query = "min(last_15m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region} > ${var.cpu_15_critical}" - type = "query alert" + vars { + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_aws_rds:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "datadog_cpu_too_high" { + name = "[${var.environment}] CPU High {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + message = "${var.message}" + + query = < ${var.cpu_high_threshold_critical} + EOF + + type = "metric alert" thresholds { - critical = "${var.cpu_15_critical}" + warning = "${var.cpu_high_threshold_warning}" + critical = "${var.cpu_high_threshold_critical}" } - tags = ["env:${var.env}", "type:system"] + tags = ["env:${var.environment}", "type:system"] - notify_no_data = "${var.linux_basics_config["notify_no_data"]}" - evaluation_delay = "${var.linux_basics_config["delay"]}" - new_host_delay = "${var.linux_basics_config["delay"]}" - renotify_interval = 60 + notify_no_data = true + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.evaluation_delay}" notify_audit = false timeout_h = 0 include_tags = true @@ -24,24 +37,29 @@ resource "datadog_monitor" "cpu_80_15min" { no_data_timeframe = 20 } -resource "datadog_monitor" "cpu_95_5min" { - name = "[${var.env}] CPU High > ${var.cpu_5_critical} for 5 min on {{host.name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" +resource "datadog_monitor" "datadog_free_disk_space_too_low" { + name = "[${var.environment}] Free disk space {{comparator}} 
{{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + message = "${var.message}" - query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region} > ${var.cpu_5_critical}" - type = "query alert" - count = "${var.dd_linux_basics == "enabled" ? 1 : 0}" + query = < Date: Thu, 22 Feb 2018 12:13:05 +0100 Subject: [PATCH 07/11] MON-96 - Updated all monitors to follow recommendations --- cloud/aws/elasticsearch/monitors-elasticsearch.tf | 4 ++-- cloud/aws/rds/monitors-rds.tf | 4 ++-- middleware/apache/README.md | 1 - middleware/apache/inputs.tf | 4 ---- middleware/apache/monitors-apache.tf | 8 ++++---- middleware/nginx/README.md | 1 - middleware/nginx/inputs.tf | 4 ---- middleware/nginx/monitors-nginx.tf | 8 ++++---- middleware/php-fpm/monitors-fpm.tf | 12 ++++++------ system/generic/monitors-custom-cpu.tf | 8 ++++---- system/linux/monitors-linux-basics.tf | 4 ++-- 11 files changed, 24 insertions(+), 34 deletions(-) diff --git a/cloud/aws/elasticsearch/monitors-elasticsearch.tf b/cloud/aws/elasticsearch/monitors-elasticsearch.tf index 8880a02..5a55f4b 100644 --- a/cloud/aws/elasticsearch/monitors-elasticsearch.tf +++ b/cloud/aws/elasticsearch/monitors-elasticsearch.tf @@ -33,7 +33,7 @@ EOF timeout_h = 0 include_tags = true locked = false - require_full_window = true + require_full_window = false new_host_delay = "${var.evaluation_delay}" no_data_timeframe = 20 @@ -66,7 +66,7 @@ EOF timeout_h = 0 include_tags = true locked = false - require_full_window = true + require_full_window = false new_host_delay = "${var.evaluation_delay}" no_data_timeframe = 20 diff --git a/cloud/aws/rds/monitors-rds.tf b/cloud/aws/rds/monitors-rds.tf index dffb178..430cf37 100644 --- a/cloud/aws/rds/monitors-rds.tf +++ 
b/cloud/aws/rds/monitors-rds.tf @@ -30,7 +30,7 @@ EOF timeout_h = 0 include_tags = true locked = false - require_full_window = true + require_full_window = false new_host_delay = "${var.evaluation_delay}" no_data_timeframe = 20 @@ -62,7 +62,7 @@ EOF timeout_h = 0 include_tags = true locked = false - require_full_window = true + require_full_window = false new_host_delay = "${var.evaluation_delay}" no_data_timeframe = 20 diff --git a/middleware/apache/README.md b/middleware/apache/README.md index 4d9c00f..aaf5630 100644 --- a/middleware/apache/README.md +++ b/middleware/apache/README.md @@ -24,7 +24,6 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| dd_apache | | string | `disabled` | no | | environment | Architecture Environment | string | - | yes | | evaluation_delay | Delay in seconds for the metric evaluation | string | `15` | no | | message | Message sent when an alert is triggered | string | - | yes | diff --git a/middleware/apache/inputs.tf b/middleware/apache/inputs.tf index 0c3b10e..ca3d259 100644 --- a/middleware/apache/inputs.tf +++ b/middleware/apache/inputs.tf @@ -15,7 +15,3 @@ variable "message" { } # Apache Middleware specific - -variable "dd_apache" { - default = "disabled" -} diff --git a/middleware/apache/monitors-apache.tf b/middleware/apache/monitors-apache.tf index 11f4624..2fc3126 100644 --- a/middleware/apache/monitors-apache.tf +++ b/middleware/apache/monitors-apache.tf @@ -3,7 +3,7 @@ resource "datadog_monitor" "Apache_process" { message = "${var.message}" type = "service check" - query = "\"process.up\".over(\"dd_monitoring:enabled\",\"dd_apache:enabled\",\"process:apache\",\"env:${var.environment}\").by(\"host\",\"process\", \"app\").last(4).count_by_status()" + query = "\"apache.can_connect\".over(\"dd_monitoring:enabled\",\"dd_apache:enabled\",\"process:apache\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" thresholds = { ok = 1 @@ -11,10 +11,10 
@@ resource "datadog_monitor" "Apache_process" { critical = 4 } - notify_no_data = false + notify_no_data = true evaluation_delay = "${var.evaluation_delay}" new_host_delay = "${var.evaluation_delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -22,5 +22,5 @@ resource "datadog_monitor" "Apache_process" { require_full_window = true no_data_timeframe = 20 - tags = ["env:${var.environment}", "type:apache"] + tags = ["env:${var.environment}", "type:resource"] } diff --git a/middleware/nginx/README.md b/middleware/nginx/README.md index 435dde6..ccf7772 100644 --- a/middleware/nginx/README.md +++ b/middleware/nginx/README.md @@ -24,7 +24,6 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| dd_nginx | | string | `disabled` | no | | environment | Architecture Environment | string | - | yes | | evaluation_delay | Delay in seconds for the metric evaluation | string | `15` | no | | message | Message sent when an alert is triggered | string | - | yes | diff --git a/middleware/nginx/inputs.tf b/middleware/nginx/inputs.tf index f823913..ea23988 100644 --- a/middleware/nginx/inputs.tf +++ b/middleware/nginx/inputs.tf @@ -15,7 +15,3 @@ variable "message" { } # Nginx Middleware specific - -variable "dd_nginx" { - default = "disabled" -} diff --git a/middleware/nginx/monitors-nginx.tf b/middleware/nginx/monitors-nginx.tf index b018ec3..a569c60 100644 --- a/middleware/nginx/monitors-nginx.tf +++ b/middleware/nginx/monitors-nginx.tf @@ -3,7 +3,7 @@ resource "datadog_monitor" "Nginx_process" { message = "${var.message}" type = "service check" - query = "\"process.up\".over(\"dd_monitoring:enabled\",\"dd_nginx:enabled\",\"process:nginx\",\"env:${var.environment}\").by(\"host\",\"process\", \"app\").last(4).count_by_status()" + query = 
"\"nginx.can_connect\".over(\"dd_monitoring:enabled\",\"dd_nginx:enabled\",\"process:nginx\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" thresholds = { ok = 1 @@ -11,10 +11,10 @@ resource "datadog_monitor" "Nginx_process" { critical = 4 } - notify_no_data = false + notify_no_data = true evaluation_delay = "${var.evaluation_delay}" new_host_delay = "${var.evaluation_delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -22,5 +22,5 @@ resource "datadog_monitor" "Nginx_process" { require_full_window = true no_data_timeframe = 20 - tags = ["env:${var.environment}", "type:nginx"] + tags = ["env:${var.environment}", "type:resource"] } diff --git a/middleware/php-fpm/monitors-fpm.tf b/middleware/php-fpm/monitors-fpm.tf index 95289e8..2b06108 100644 --- a/middleware/php-fpm/monitors-fpm.tf +++ b/middleware/php-fpm/monitors-fpm.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_aws_rds:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_aws_php_fpm:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } @@ -25,7 +25,7 @@ resource "datadog_monitor" "php-fpm_process_idle" { critical = "${var.php_fpm_busy_threshold_critical}" } - notify_no_data = false + notify_no_data = true evaluation_delay = "${var.evaluation_delay_metric}" new_host_delay = "${var.evaluation_delay_metric}" notify_audit = false @@ -35,7 +35,7 @@ resource "datadog_monitor" "php-fpm_process_idle" { require_full_window = true no_data_timeframe = 20 - tags = ["env:${var.environment}", "type:php-fpm"] + tags = ["env:${var.environment}", "type:resource"] } resource "datadog_monitor" "FPM_process" { @@ -43,7 +43,7 @@ resource "datadog_monitor" "FPM_process" { message = "${var.message}" type = "service check" - query = "\"process.up\".over(\"dd_monitoring:enabled\",\"dd_php_fpm:enabled\",\"process:php_fpm\",\"env:${var.environment}\").by(\"host\",\"process\", \"app\").last(4).count_by_status()" + query = "\"php_fpm.can_ping\".over(\"dd_monitoring:enabled\",\"dd_php_fpm:enabled\",\"process:php_fpm\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" thresholds = { ok = 1 @@ -54,7 +54,7 @@ resource "datadog_monitor" "FPM_process" { notify_no_data = true evaluation_delay = "${var.evaluation_delay_service}" new_host_delay = "${var.evaluation_delay_service}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -62,5 +62,5 @@ resource "datadog_monitor" "FPM_process" { require_full_window = true no_data_timeframe = 20 - tags = ["env:${var.environment}", "type:php-fpm"] + tags = ["env:${var.environment}", "type:resource"] } diff --git a/system/generic/monitors-custom-cpu.tf b/system/generic/monitors-custom-cpu.tf index 57b9b03..7f862cf 100644 --- a/system/generic/monitors-custom-cpu.tf +++ b/system/generic/monitors-custom-cpu.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = 
"${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_aws_rds:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_aws_system:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } @@ -12,8 +12,8 @@ resource "datadog_monitor" "cpu_custom" { query = < ${var.custom_cpu_threshold_critical}" EOF @@ -27,7 +27,7 @@ resource "datadog_monitor" "cpu_custom" { notify_no_data = true evaluation_delay = "${var.evaluation_delay}" new_host_delay = "${var.evaluation_delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true diff --git a/system/linux/monitors-linux-basics.tf b/system/linux/monitors-linux-basics.tf index 6bd173d..d5762fe 100644 --- a/system/linux/monitors-linux-basics.tf +++ b/system/linux/monitors-linux-basics.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_aws_rds:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_aws_linux:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } @@ -122,7 +122,7 @@ resource "datadog_monitor" "datadog_free_memory" { notify_no_data = true evaluation_delay = "${var.evaluation_delay}" new_host_delay = "${var.evaluation_delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true From 73d81d7053cfc62b3fc9afb38dc9eb74123971cf Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Thu, 22 Feb 2018 12:15:53 +0100 Subject: [PATCH 08/11] MON-96 - Renamed system's monitors --- middleware/apache/inputs.tf | 1 + middleware/nginx/inputs.tf | 1 + system/generic/{monitors-custom-cpu.tf => monitors-system.tf} | 0 system/linux/{monitors-linux-basics.tf => monitors-linux.tf} | 0 4 files changed, 2 insertions(+) rename system/generic/{monitors-custom-cpu.tf => monitors-system.tf} (100%) rename system/linux/{monitors-linux-basics.tf => monitors-linux.tf} (100%) diff --git a/middleware/apache/inputs.tf b/middleware/apache/inputs.tf index ca3d259..1812e3d 100644 --- a/middleware/apache/inputs.tf +++ b/middleware/apache/inputs.tf @@ -15,3 +15,4 @@ variable "message" { } # Apache Middleware specific + diff --git a/middleware/nginx/inputs.tf b/middleware/nginx/inputs.tf index ea23988..a82f6c2 100644 --- a/middleware/nginx/inputs.tf +++ b/middleware/nginx/inputs.tf @@ -15,3 +15,4 @@ variable "message" { } # Nginx Middleware specific + diff --git a/system/generic/monitors-custom-cpu.tf b/system/generic/monitors-system.tf similarity index 100% rename from system/generic/monitors-custom-cpu.tf rename to system/generic/monitors-system.tf diff --git a/system/linux/monitors-linux-basics.tf b/system/linux/monitors-linux.tf similarity index 100% rename from system/linux/monitors-linux-basics.tf rename to system/linux/monitors-linux.tf From 4fcbd5da7effa0b514714385a66d7e91b0001fdb Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Thu, 22 Feb 2018 14:56:25 +0100 Subject: 
[PATCH 09/11] MON-96 - Migrated system/linux monitors into system/generic --- .../elasticsearch/monitors-elasticsearch.tf | 2 +- middleware/apache/monitors-apache.tf | 8 +- middleware/nginx/monitors-nginx.tf | 8 +- middleware/php-fpm/monitors-fpm.tf | 14 +- system/generic/README.md | 12 +- system/generic/inputs.tf | 42 ++++- system/generic/monitors-system.tf | 111 ++++++++++- system/linux/README.md | 43 ----- system/linux/inputs.tf | 72 ------- system/linux/monitors-linux.tf | 178 ------------------ 10 files changed, 164 insertions(+), 326 deletions(-) delete mode 100644 system/linux/README.md delete mode 100644 system/linux/inputs.tf delete mode 100644 system/linux/monitors-linux.tf diff --git a/cloud/aws/elasticsearch/monitors-elasticsearch.tf b/cloud/aws/elasticsearch/monitors-elasticsearch.tf index 5a55f4b..b665926 100644 --- a/cloud/aws/elasticsearch/monitors-elasticsearch.tf +++ b/cloud/aws/elasticsearch/monitors-elasticsearch.tf @@ -98,7 +98,7 @@ EOF timeout_h = 0 include_tags = true locked = false - require_full_window = true + require_full_window = false new_host_delay = "${var.evaluation_delay}" no_data_timeframe = 20 diff --git a/middleware/apache/monitors-apache.tf b/middleware/apache/monitors-apache.tf index 2fc3126..b1f8d53 100644 --- a/middleware/apache/monitors-apache.tf +++ b/middleware/apache/monitors-apache.tf @@ -1,9 +1,9 @@ -resource "datadog_monitor" "Apache_process" { - name = "[${var.environment}] Apache process is down on {{host.name}}" +resource "datadog_monitor" "datadog_apache_process" { + name = "[${var.environment}] Can't connect to apache, process is not running on {{host.name}}" message = "${var.message}" type = "service check" - query = "\"apache.can_connect\".over(\"dd_monitoring:enabled\",\"dd_apache:enabled\",\"process:apache\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" + query = 
"\"apache.can_connect\".over(\"dd_monitoring:enabled\",\"dd_apache:enabled\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" thresholds = { ok = 1 @@ -22,5 +22,5 @@ resource "datadog_monitor" "Apache_process" { require_full_window = true no_data_timeframe = 20 - tags = ["env:${var.environment}", "type:resource"] + tags = ["env:${var.environment}", "resource:apache"] } diff --git a/middleware/nginx/monitors-nginx.tf b/middleware/nginx/monitors-nginx.tf index a569c60..f81a291 100644 --- a/middleware/nginx/monitors-nginx.tf +++ b/middleware/nginx/monitors-nginx.tf @@ -1,9 +1,9 @@ -resource "datadog_monitor" "Nginx_process" { - name = "[${var.environment}] Nginx process is down on {{host.name}}" +resource "datadog_monitor" "datadog_nginx_process" { + name = "[${var.environment}] Can't connect to nginx, process is not running on {{host.name}}" message = "${var.message}" type = "service check" - query = "\"nginx.can_connect\".over(\"dd_monitoring:enabled\",\"dd_nginx:enabled\",\"process:nginx\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" + query = "\"nginx.can_connect\".over(\"dd_monitoring:enabled\",\"dd_nginx:enabled\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" thresholds = { ok = 1 @@ -22,5 +22,5 @@ resource "datadog_monitor" "Nginx_process" { require_full_window = true no_data_timeframe = 20 - tags = ["env:${var.environment}", "type:resource"] + tags = ["env:${var.environment}", "resource:nginx"] } diff --git a/middleware/php-fpm/monitors-fpm.tf b/middleware/php-fpm/monitors-fpm.tf index 2b06108..e85f0e6 100644 --- a/middleware/php-fpm/monitors-fpm.tf +++ b/middleware/php-fpm/monitors-fpm.tf @@ -2,11 +2,11 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_aws_php_fpm:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_php_fpm:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } -resource "datadog_monitor" "php-fpm_process_idle" { +resource "datadog_monitor" "datadog_php_fpm_process_idle" { name = "[${var.environment}] php_fpm busy worker {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" message = "${var.message}" @@ -35,15 +35,15 @@ resource "datadog_monitor" "php-fpm_process_idle" { require_full_window = true no_data_timeframe = 20 - tags = ["env:${var.environment}", "type:resource"] + tags = ["env:${var.environment}", "resource:php-fpm"] } -resource "datadog_monitor" "FPM_process" { - name = "[${var.environment}] FPM process is down on {{host.name}}" +resource "datadog_monitor" "datadog_fpm_process" { + name = "[${var.environment}] Can't ping FPM, process is not running on {{host.name}}" message = "${var.message}" type = "service check" - query = "\"php_fpm.can_ping\".over(\"dd_monitoring:enabled\",\"dd_php_fpm:enabled\",\"process:php_fpm\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" + query = "\"php_fpm.can_ping\".over(\"dd_monitoring:enabled\",\"dd_php_fpm:enabled\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" thresholds = { ok = 1 @@ -62,5 +62,5 @@ resource "datadog_monitor" "FPM_process" { require_full_window = true no_data_timeframe = 20 - tags = ["env:${var.environment}", "type:resource"] + tags = ["env:${var.environment}", "resource:php-fpm"] } diff --git a/system/generic/README.md b/system/generic/README.md index eb85357..abf80e0 100644 --- a/system/generic/README.md +++ b/system/generic/README.md @@ -24,11 +24,17 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| 
custom_cpu_period | Set up period for the query | string | `last_5m` | no | -| custom_cpu_threshold_critical | Custom CPU critical threshold | string | `95` | no | -| custom_cpu_threshold_warning | Custom CPU warning threshold | string | `80` | no | +| cpu_high_threshold_critical | CPU high critical threshold | string | `95` | no | +| cpu_high_threshold_warning | CPU high warning threshold | string | `80` | no | +| cpu_high_timeframe | CPU high timeframe | string | `last_5m` | no | | environment | Architecture Environment | string | - | yes | | evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| free_disk_inodes_threshold_critical | Free disk inodes critical threshold | string | `5` | no | +| free_disk_inodes_threshold_warning | Free disk inodes warning threshold | string | `10` | no | +| free_disk_space_threshold_critical | Free disk space critical threshold | string | `5` | no | +| free_disk_space_threshold_warning | Free disk space warning threshold | string | `10` | no | +| free_memory_threshold_critical | Free memory critical threshold | string | `5` | no | +| free_memory_threshold_warning | Free memory warning threshold | string | `10` | no | | message | Message sent when an alert is triggered | string | - | yes | diff --git a/system/generic/inputs.tf b/system/generic/inputs.tf index 06a118e..5b34296 100644 --- a/system/generic/inputs.tf +++ b/system/generic/inputs.tf @@ -26,17 +26,47 @@ variable "filter_tags_custom" { # Custom CPU instance specific -variable "custom_cpu_period" { - description = "Set up period for the query" +variable "cpu_high_timeframe" { + description = "CPU high timeframe" default = "last_5m" } -variable "custom_cpu_threshold_warning" { - description = "Custom CPU warning threshold" +variable 
"cpu_high_threshold_warning" { + description = "CPU high warning threshold" default = 80 } -variable "custom_cpu_threshold_critical" { - description = "Custom CPU critical threshold" +variable "cpu_high_threshold_critical" { + description = "CPU high critical threshold" default = 95 } + +variable "free_disk_space_threshold_warning" { + description = "Free disk space warning threshold" + default = 10 +} + +variable "free_disk_space_threshold_critical" { + description = "Free disk space critical threshold" + default = 5 +} + +variable "free_disk_inodes_threshold_warning" { + description = "Free disk inodes warning threshold" + default = 10 +} + +variable "free_disk_inodes_threshold_critical" { + description = "Free disk inodes critical threshold" + default = 5 +} + +variable "free_memory_threshold_warning" { + description = "Free memory warning threshold" + default = 10 +} + +variable "free_memory_threshold_critical" { + description = "Free memory critical threshold" + default = 5 +} diff --git a/system/generic/monitors-system.tf b/system/generic/monitors-system.tf index 7f862cf..473701c 100644 --- a/system/generic/monitors-system.tf +++ b/system/generic/monitors-system.tf @@ -6,24 +6,119 @@ data "template_file" "filter" { } } -resource "datadog_monitor" "cpu_custom" { +resource "datadog_monitor" "datadog_cpu_too_high" { name = "[${var.environment}] CPU too High {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" message = "${var.message}" query = < ${var.custom_cpu_threshold_critical}" + min(${var.cpu_high_timeframe}): ( + avg:system.cpu.system{${data.template_file.filter.rendered}} by {region,host} + + avg:system.cpu.user{${data.template_file.filter.rendered}} by {region,host} + ) > ${var.cpu_high_threshold_critical} EOF type = "metric alert" - thresholds = { - warning = "${var.custom_cpu_threshold_warning}" - critical = "${var.custom_cpu_threshold_critical}" + thresholds { + warning = 
"${var.cpu_high_threshold_warning}" + critical = "${var.cpu_high_threshold_critical}" } + tags = ["env:${var.environment}", "type:system"] + + notify_no_data = true + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.evaluation_delay}" + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + no_data_timeframe = 20 +} + +resource "datadog_monitor" "datadog_free_disk_space_too_low" { + name = "[${var.environment}] Free disk space {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + message = "${var.message}" + + query = < ${var.cpu_high_threshold_critical} - EOF - - type = "metric alert" - - thresholds { - warning = "${var.cpu_high_threshold_warning}" - critical = "${var.cpu_high_threshold_critical}" - } - - tags = ["env:${var.environment}", "type:system"] - - notify_no_data = true - evaluation_delay = "${var.evaluation_delay}" - new_host_delay = "${var.evaluation_delay}" - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - no_data_timeframe = 20 -} - -resource "datadog_monitor" "datadog_free_disk_space_too_low" { - name = "[${var.environment}] Free disk space {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" - message = "${var.message}" - - query = < Date: Thu, 22 Feb 2018 17:23:31 +0100 Subject: [PATCH 10/11] MON-96 - Fix middleware monitors names and update system monitors --- middleware/apache/monitors-apache.tf | 2 +- middleware/nginx/monitors-nginx.tf | 2 +- middleware/php-fpm/monitors-fpm.tf | 2 +- system/generic/monitors-system.tf | 17 ++++++++--------- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/middleware/apache/monitors-apache.tf b/middleware/apache/monitors-apache.tf index b1f8d53..d31cfd6 100644 --- a/middleware/apache/monitors-apache.tf +++ 
b/middleware/apache/monitors-apache.tf @@ -1,5 +1,5 @@ resource "datadog_monitor" "datadog_apache_process" { - name = "[${var.environment}] Can't connect to apache, process is not running on {{host.name}}" + name = "[${var.environment}] Can't connect to apache vhost status" message = "${var.message}" type = "service check" diff --git a/middleware/nginx/monitors-nginx.tf b/middleware/nginx/monitors-nginx.tf index f81a291..3db1c0f 100644 --- a/middleware/nginx/monitors-nginx.tf +++ b/middleware/nginx/monitors-nginx.tf @@ -1,5 +1,5 @@ resource "datadog_monitor" "datadog_nginx_process" { - name = "[${var.environment}] Can't connect to nginx, process is not running on {{host.name}}" + name = "[${var.environment}] Can't connect to nginx vhost status" message = "${var.message}" type = "service check" diff --git a/middleware/php-fpm/monitors-fpm.tf b/middleware/php-fpm/monitors-fpm.tf index e85f0e6..5b1568b 100644 --- a/middleware/php-fpm/monitors-fpm.tf +++ b/middleware/php-fpm/monitors-fpm.tf @@ -39,7 +39,7 @@ resource "datadog_monitor" "datadog_php_fpm_process_idle" { } resource "datadog_monitor" "datadog_fpm_process" { - name = "[${var.environment}] Can't ping FPM, process is not running on {{host.name}}" + name = "[${var.environment}] Can't connect to php-fpm" message = "${var.message}" type = "service check" diff --git a/system/generic/monitors-system.tf b/system/generic/monitors-system.tf index 473701c..43ef2cc 100644 --- a/system/generic/monitors-system.tf +++ b/system/generic/monitors-system.tf @@ -2,18 +2,17 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_aws_system:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_system:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } resource "datadog_monitor" "datadog_cpu_too_high" { - name = "[${var.environment}] CPU too High {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + name = "[${var.environment}] CPU usage {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" message = "${var.message}" query = < ${var.cpu_high_threshold_critical} EOF @@ -24,7 +23,7 @@ resource "datadog_monitor" "datadog_cpu_too_high" { critical = "${var.cpu_high_threshold_critical}" } - tags = ["env:${var.environment}", "type:system"] + tags = ["env:${var.environment}", "type:system", "resource:cpu"] notify_no_data = true evaluation_delay = "${var.evaluation_delay}" @@ -55,7 +54,7 @@ resource "datadog_monitor" "datadog_free_disk_space_too_low" { critical = "${var.free_disk_space_threshold_critical}" } - tags = ["env:${var.environment}", "type:system"] + tags = ["env:${var.environment}", "type:system", "resource:disk"] notify_no_data = true evaluation_delay = "${var.evaluation_delay}" @@ -86,7 +85,7 @@ resource "datadog_monitor" "datadog_free_disk_space_inodes_too_low" { critical = "${var.free_disk_inodes_threshold_critical}" } - tags = ["env:${var.environment}", "type:system"] + tags = ["env:${var.environment}", "type:system", "resource:disk"] notify_no_data = true evaluation_delay = "${var.evaluation_delay}" @@ -101,7 +100,7 @@ resource "datadog_monitor" "datadog_free_disk_space_inodes_too_low" { resource "datadog_monitor" "datadog_free_memory" { name = "[${var.environment}] Free memory {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" - message = "Debugging alert - no escalation" + message = "${var.message}" query = < Date: Thu, 22 Feb 2018 17:49:34 +0100 Subject: [PATCH 11/11] MON-96 - Update queries system 
monitors with min() --- system/generic/monitors-system.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/system/generic/monitors-system.tf b/system/generic/monitors-system.tf index 43ef2cc..0a09ffd 100644 --- a/system/generic/monitors-system.tf +++ b/system/generic/monitors-system.tf @@ -41,7 +41,7 @@ resource "datadog_monitor" "datadog_free_disk_space_too_low" { message = "${var.message}" query = <