From 4fcbd5da7effa0b514714385a66d7e91b0001fdb Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Thu, 22 Feb 2018 14:56:25 +0100 Subject: [PATCH] MON-96 - Migrated system/linux monitors into system/generic --- .../elasticsearch/monitors-elasticsearch.tf | 2 +- middleware/apache/monitors-apache.tf | 8 +- middleware/nginx/monitors-nginx.tf | 8 +- middleware/php-fpm/monitors-fpm.tf | 14 +- system/generic/README.md | 12 +- system/generic/inputs.tf | 42 ++++- system/generic/monitors-system.tf | 111 ++++++++++- system/linux/README.md | 43 ----- system/linux/inputs.tf | 72 ------- system/linux/monitors-linux.tf | 178 ------------------ 10 files changed, 164 insertions(+), 326 deletions(-) delete mode 100644 system/linux/README.md delete mode 100644 system/linux/inputs.tf delete mode 100644 system/linux/monitors-linux.tf diff --git a/cloud/aws/elasticsearch/monitors-elasticsearch.tf b/cloud/aws/elasticsearch/monitors-elasticsearch.tf index 5a55f4b..b665926 100644 --- a/cloud/aws/elasticsearch/monitors-elasticsearch.tf +++ b/cloud/aws/elasticsearch/monitors-elasticsearch.tf @@ -98,7 +98,7 @@ EOF timeout_h = 0 include_tags = true locked = false - require_full_window = true + require_full_window = false new_host_delay = "${var.evaluation_delay}" no_data_timeframe = 20 diff --git a/middleware/apache/monitors-apache.tf b/middleware/apache/monitors-apache.tf index 2fc3126..b1f8d53 100644 --- a/middleware/apache/monitors-apache.tf +++ b/middleware/apache/monitors-apache.tf @@ -1,9 +1,9 @@ -resource "datadog_monitor" "Apache_process" { - name = "[${var.environment}] Apache process is down on {{host.name}}" +resource "datadog_monitor" "datadog_apache_process" { + name = "[${var.environment}] Can't connect to apache, process is not running on {{host.name}}" message = "${var.message}" type = "service check" - query = 
"\"apache.can_connect\".over(\"dd_monitoring:enabled\",\"dd_apache:enabled\",\"process:apache\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" + query = "\"apache.can_connect\".over(\"dd_monitoring:enabled\",\"dd_apache:enabled\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" thresholds = { ok = 1 @@ -22,5 +22,5 @@ resource "datadog_monitor" "Apache_process" { require_full_window = true no_data_timeframe = 20 - tags = ["env:${var.environment}", "type:resource"] + tags = ["env:${var.environment}", "resource:apache"] } diff --git a/middleware/nginx/monitors-nginx.tf b/middleware/nginx/monitors-nginx.tf index a569c60..f81a291 100644 --- a/middleware/nginx/monitors-nginx.tf +++ b/middleware/nginx/monitors-nginx.tf @@ -1,9 +1,9 @@ -resource "datadog_monitor" "Nginx_process" { - name = "[${var.environment}] Nginx process is down on {{host.name}}" +resource "datadog_monitor" "datadog_nginx_process" { + name = "[${var.environment}] Can't connect to nginx, process is not running on {{host.name}}" message = "${var.message}" type = "service check" - query = "\"nginx.can_connect\".over(\"dd_monitoring:enabled\",\"dd_nginx:enabled\",\"process:nginx\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" + query = "\"nginx.can_connect\".over(\"dd_monitoring:enabled\",\"dd_nginx:enabled\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" thresholds = { ok = 1 @@ -22,5 +22,5 @@ resource "datadog_monitor" "Nginx_process" { require_full_window = true no_data_timeframe = 20 - tags = ["env:${var.environment}", "type:resource"] + tags = ["env:${var.environment}", "resource:nginx"] } diff --git a/middleware/php-fpm/monitors-fpm.tf b/middleware/php-fpm/monitors-fpm.tf index 2b06108..e85f0e6 100644 --- a/middleware/php-fpm/monitors-fpm.tf +++ b/middleware/php-fpm/monitors-fpm.tf @@ -2,11 +2,11 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = 
"${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_aws_php_fpm:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_php_fpm:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } -resource "datadog_monitor" "php-fpm_process_idle" { +resource "datadog_monitor" "datadog_php_fpm_process_idle" { name = "[${var.environment}] php_fpm busy worker {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" message = "${var.message}" @@ -35,15 +35,15 @@ resource "datadog_monitor" "php-fpm_process_idle" { require_full_window = true no_data_timeframe = 20 - tags = ["env:${var.environment}", "type:resource"] + tags = ["env:${var.environment}", "resource:php-fpm"] } -resource "datadog_monitor" "FPM_process" { - name = "[${var.environment}] FPM process is down on {{host.name}}" +resource "datadog_monitor" "datadog_fpm_process" { + name = "[${var.environment}] Can't ping FPM, process is not running on {{host.name}}" message = "${var.message}" type = "service check" - query = "\"php_fpm.can_ping\".over(\"dd_monitoring:enabled\",\"dd_php_fpm:enabled\",\"process:php_fpm\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" + query = "\"php_fpm.can_ping\".over(\"dd_monitoring:enabled\",\"dd_php_fpm:enabled\",\"env:${var.environment}\").by(\"host\",\"port\").last(6).count_by_status()" thresholds = { ok = 1 @@ -62,5 +62,5 @@ resource "datadog_monitor" "FPM_process" { require_full_window = true no_data_timeframe = 20 - tags = ["env:${var.environment}", "type:resource"] + tags = ["env:${var.environment}", "resource:php-fpm"] } diff --git a/system/generic/README.md b/system/generic/README.md index eb85357..abf80e0 100644 --- a/system/generic/README.md +++ b/system/generic/README.md @@ -24,11 +24,17 @@ Inputs | Name | Description | Type | Default | Required | 
|------|-------------|:----:|:-----:|:-----:| -| custom_cpu_period | Set up period for the query | string | `last_5m` | no | -| custom_cpu_threshold_critical | Custom CPU critical threshold | string | `95` | no | -| custom_cpu_threshold_warning | Custom CPU warning threshold | string | `80` | no | +| cpu_high_threshold_critical | CPU high critical threshold | string | `95` | no | +| cpu_high_threshold_warning | CPU high warning threshold | string | `80` | no | +| cpu_high_timeframe | CPU high timeframe | string | `last_5m` | no | | environment | Architecture Environment | string | - | yes | | evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| free_disk_inodes_threshold_critical | Free disk inodes critical threshold | string | `5` | no | +| free_disk_inodes_threshold_warning | Free disk inodes warning threshold | string | `10` | no | +| free_disk_space_threshold_critical | Free disk space critical threshold | string | `5` | no | +| free_disk_space_threshold_warning | Free disk space warning threshold | string | `10` | no | +| free_memory_threshold_critical | Free memory critical threshold | string | `5` | no | +| free_memory_threshold_warning | Free memory warning threshold | string | `10` | no | | message | Message sent when an alert is triggered | string | - | yes | diff --git a/system/generic/inputs.tf b/system/generic/inputs.tf index 06a118e..5b34296 100644 --- a/system/generic/inputs.tf +++ b/system/generic/inputs.tf @@ -26,17 +26,47 @@ variable "filter_tags_custom" { # Custom CPU instance specific -variable "custom_cpu_period" { - description = "Set up period for the query" +variable "cpu_high_timeframe" { + description = "CPU high timeframe" default = "last_5m" } -variable "custom_cpu_threshold_warning" { - 
description = "Custom CPU warning threshold" +variable "cpu_high_threshold_warning" { + description = "CPU high warning threshold" default = 80 } -variable "custom_cpu_threshold_critical" { - description = "Custom CPU critical threshold" +variable "cpu_high_threshold_critical" { + description = "CPU high critical threshold" default = 95 } + +variable "free_disk_space_threshold_warning" { + description = "Free disk space warning threshold" + default = 10 +} + +variable "free_disk_space_threshold_critical" { + description = "Free disk space critical threshold" + default = 5 +} + +variable "free_disk_inodes_threshold_warning" { + description = "Free disk inodes warning threshold" + default = 10 +} + +variable "free_disk_inodes_threshold_critical" { + description = "Free disk inodes critical threshold" + default = 5 +} + +variable "free_memory_threshold_warning" { + description = "Free memory warning threshold" + default = 10 +} + +variable "free_memory_threshold_critical" { + description = "Free memory critical threshold" + default = 5 +} diff --git a/system/generic/monitors-system.tf b/system/generic/monitors-system.tf index 7f862cf..473701c 100644 --- a/system/generic/monitors-system.tf +++ b/system/generic/monitors-system.tf @@ -6,24 +6,119 @@ data "template_file" "filter" { } } -resource "datadog_monitor" "cpu_custom" { +resource "datadog_monitor" "datadog_cpu_too_high" { name = "[${var.environment}] CPU too High {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" message = "${var.message}" query = < ${var.custom_cpu_threshold_critical}" + min(${var.cpu_high_timeframe}): ( + avg:system.cpu.system{${data.template_file.filter.rendered}} by {region,host} + + avg:system.cpu.user{${data.template_file.filter.rendered}} by {region,host} + ) > ${var.cpu_high_threshold_critical} EOF type = "metric alert" - thresholds = { - warning = "${var.custom_cpu_threshold_warning}" - critical = 
"${var.custom_cpu_threshold_critical}" + thresholds { + warning = "${var.cpu_high_threshold_warning}" + critical = "${var.cpu_high_threshold_critical}" } + tags = ["env:${var.environment}", "type:system"] + + notify_no_data = true + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.evaluation_delay}" + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + no_data_timeframe = 20 +} + +resource "datadog_monitor" "datadog_free_disk_space_too_low" { + name = "[${var.environment}] Free disk space {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + message = "${var.message}" + + query = < ${var.cpu_high_threshold_critical} - EOF - - type = "metric alert" - - thresholds { - warning = "${var.cpu_high_threshold_warning}" - critical = "${var.cpu_high_threshold_critical}" - } - - tags = ["env:${var.environment}", "type:system"] - - notify_no_data = true - evaluation_delay = "${var.evaluation_delay}" - new_host_delay = "${var.evaluation_delay}" - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - no_data_timeframe = 20 -} - -resource "datadog_monitor" "datadog_free_disk_space_too_low" { - name = "[${var.environment}] Free disk space {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" - message = "${var.message}" - - query = <