From 6d82a378cc5b3f9fc92722c7cc3d538e7a7f9580 Mon Sep 17 00:00:00 2001 From: vincent EL KHATIB Date: Mon, 11 Sep 2017 14:36:17 +0200 Subject: [PATCH 1/4] MON-49 change with new monitors --- inputs.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inputs.tf b/inputs.tf index 02cf08f..b8f20c9 100644 --- a/inputs.tf +++ b/inputs.tf @@ -77,4 +77,4 @@ variable "elb_backend_latency" { warning = 1000 critical = 5000 } -} \ No newline at end of file +} From b0ccd4e89259e2a887bfd0fc7cd591fd17876921 Mon Sep 17 00:00:00 2001 From: vincent EL KHATIB Date: Mon, 10 Jul 2017 11:10:42 +0200 Subject: [PATCH 2/4] fix inputs values --- inputs.tf | 13 +++++++++++++ monitors-nginx-basics.tf | 26 ++++++++++++++++++++++++++ monitors-php-basics.tf | 27 +++++++++++++++++++++++++++ 3 files changed, 66 insertions(+) create mode 100644 monitors-nginx-basics.tf create mode 100644 monitors-php-basics.tf diff --git a/inputs.tf b/inputs.tf index b8f20c9..73c7019 100644 --- a/inputs.tf +++ b/inputs.tf @@ -24,6 +24,7 @@ variable "dd_custom_cpu" { } + ## RDS variable "dd_aws_rds" { default = "disabled" @@ -49,6 +50,7 @@ variable "rds_mem_threshold" { } } + ## ELB variable "dd_aws_elb" { default = "disable" @@ -78,3 +80,14 @@ variable "elb_backend_latency" { critical = 5000 } } + +##apache nginx php +variable "dd_apache_basics" { + default = "disabled" +} +variable "dd_nginx_basics" { + default = "disabled" +} +variable "dd_php_basics" { + default = "disabled" +} \ No newline at end of file diff --git a/monitors-nginx-basics.tf b/monitors-nginx-basics.tf new file mode 100644 index 0000000..ccf2f7c --- /dev/null +++ b/monitors-nginx-basics.tf @@ -0,0 +1,26 @@ +resource "datadog_monitor" "Nginx_process" { + name = "Nginx process is down on {{host.name}}" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} 
\n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" + + + type = "service check" + query = "\"process.up\".over(\"dd_monitoring:enabled"\,\"process:nginx\").exclude(\"dd_custom_nginx:enabled\").last(4).count_by_status()" + count = "${var.dd_nginx_basics == "true" ? 1 : 0 }" + + thresholds = { + ok = 1 + warning = 2 + critical = 4 + } + + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + notify_no_data = false + renotify_interval = 0 + no_data_timeframe = 20 +} diff --git a/monitors-php-basics.tf b/monitors-php-basics.tf new file mode 100644 index 0000000..7a9651c --- /dev/null +++ b/monitors-php-basics.tf @@ -0,0 +1,27 @@ +resource "datadog_monitor" "php-fpm_process_idle" { + name = "php_fpm busy worker > 99% on {{host.name}}" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" + + + type = "service check" + query = "avg(last_10m):avg:php_fpm.processes.active{\"dd_monitoring:enabled\",\"dd_php_basics:enabled\",\"!dd_custom_php:enabled\"} by {host} / ( avg:php_fpm.processes.idle{\"dd_monitoring:enabled\",\"dd_php_basics:enabled\",\"!dd_custom_php:enabled\"} by {host} + avg:php_fpm.processes.active{\"dd_monitoring:enabled\",\"dd_php_basics:enabled\",\"!dd_custom_php:enabled\"} by {host} ) > 0.99" + count = "${var.dd_php_basics == "true" ? 
1 : 0 }" + + thresholds { + warning = 0.95 + critical = 0.99 + } + + notify_no_data = false + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 300 + renotify_interval = 0 + no_data_timeframe = 20 +} + + From e7b53ccee4a732bd6aa892548a6806f9cfa20fb2 Mon Sep 17 00:00:00 2001 From: vincent EL KHATIB Date: Mon, 11 Sep 2017 15:17:10 +0200 Subject: [PATCH 3/4] MON-27 change inputs --- inputs.tf | 1 - monitors-nginx-basics.tf => monitors_nginx.tf | 2 +- monitors-php-basics.tf => monitors_php.tf | 0 3 files changed, 1 insertion(+), 2 deletions(-) rename monitors-nginx-basics.tf => monitors_nginx.tf (85%) rename monitors-php-basics.tf => monitors_php.tf (100%) diff --git a/inputs.tf b/inputs.tf index 73c7019..75cb5f1 100644 --- a/inputs.tf +++ b/inputs.tf @@ -24,7 +24,6 @@ variable "dd_custom_cpu" { } - ## RDS variable "dd_aws_rds" { default = "disabled" diff --git a/monitors-nginx-basics.tf b/monitors_nginx.tf similarity index 85% rename from monitors-nginx-basics.tf rename to monitors_nginx.tf index ccf2f7c..3d36a99 100644 --- a/monitors-nginx-basics.tf +++ b/monitors_nginx.tf @@ -4,7 +4,7 @@ resource "datadog_monitor" "Nginx_process" { type = "service check" - query = "\"process.up\".over(\"dd_monitoring:enabled"\,\"process:nginx\").exclude(\"dd_custom_nginx:enabled\").last(4).count_by_status()" + query = "process.up.over(dd_monitoring:enabled,process:nginx).exclude(dd_custom_nginx:enabled).last(4).count_by_status()" count = "${var.dd_nginx_basics == "true" ? 
1 : 0 }" thresholds = { diff --git a/monitors-php-basics.tf b/monitors_php.tf similarity index 100% rename from monitors-php-basics.tf rename to monitors_php.tf From a1f78f071c195408b131d8e83e7fd988e3c37163 Mon Sep 17 00:00:00 2001 From: vincent EL KHATIB Date: Mon, 11 Sep 2017 17:45:40 +0200 Subject: [PATCH 4/4] MON-27 change monitors --- inputs.tf | 21 +++++++++++++++--- monitors_apache.tf | 26 ++++++++++++++++++++++ monitors_fpm.tf | 54 ++++++++++++++++++++++++++++++++++++++++++++++ monitors_nginx.tf | 14 ++++++------ monitors_php.tf | 27 ----------------------- 5 files changed, 105 insertions(+), 37 deletions(-) create mode 100644 monitors_apache.tf create mode 100644 monitors_fpm.tf delete mode 100644 monitors_php.tf diff --git a/inputs.tf b/inputs.tf index 75cb5f1..03b1829 100644 --- a/inputs.tf +++ b/inputs.tf @@ -81,12 +81,27 @@ variable "elb_backend_latency" { } ##apache nginx php -variable "dd_apache_basics" { +variable "dd_nginx" { default = "disabled" } -variable "dd_nginx_basics" { +variable "dd_php_fpm" { default = "disabled" } -variable "dd_php_basics" { + +variable "dd_apache" { default = "disabled" +} +variable "apache_nginx_fpm_config" { + type = "map" + default = { + notify_no_data = false + delay = 900 + } +} +variable "php_fpm_busy_threshold" { + type = "map" + default = { + warning = 0.8 + critical = 0.9 + } } \ No newline at end of file diff --git a/monitors_apache.tf b/monitors_apache.tf new file mode 100644 index 0000000..b318de2 --- /dev/null +++ b/monitors_apache.tf @@ -0,0 +1,26 @@ +resource "datadog_monitor" "Apache_process" { + name = "[${var.env}] Apache process is down on {{host.name}}" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" + + + type = "service check" + query = 
"\"process.up\".over(\"dd_monitoring:enabled\",\"dd_apache:enabled\",\"process:apache\",\"env:${var.env}\").by(\"host\",\"process\", \"app\").last(4).count_by_status()" + count = "${var.dd_apache == "enabled" ? 1 : 0 }" + + thresholds = { + ok = 1 + warning = 2 + critical = 4 + } + + notify_no_data = "${var.apache_nginx_fpm_config["notify_no_data"]}" + evaluation_delay = "${var.apache_nginx_fpm_config["delay"]}" + new_host_delay = "${var.apache_nginx_fpm_config["delay"]}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + no_data_timeframe = 20 +} diff --git a/monitors_fpm.tf b/monitors_fpm.tf new file mode 100644 index 0000000..cff10a0 --- /dev/null +++ b/monitors_fpm.tf @@ -0,0 +1,54 @@ +resource "datadog_monitor" "php-fpm_process_idle" { + name = "[${var.env}] php_fpm busy worker > 90% on {{host.name}}" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" + + + type = "query alert" + # Busy ratio = active / (idle + active), same grouping on every series; compared value is kept equal to thresholds.critical. + query = "avg(last_10m):avg:php_fpm.processes.active{dd_monitoring:enabled,dd_php_fpm:enabled,env:${var.env}} by {host,region,app} / ( avg:php_fpm.processes.idle{dd_monitoring:enabled,dd_php_fpm:enabled,env:${var.env}} by {host,region,app} + avg:php_fpm.processes.active{dd_monitoring:enabled,dd_php_fpm:enabled,env:${var.env}} by {host,region,app} ) > ${var.php_fpm_busy_threshold["critical"]}" + count = "${var.dd_php_fpm == "enabled" ? 
1 : 0 }" + + thresholds { + warning = "${var.php_fpm_busy_threshold["warning"]}" + critical = "${var.php_fpm_busy_threshold["critical"]}" + } + + notify_no_data = "${var.apache_nginx_fpm_config["notify_no_data"]}" + evaluation_delay = "${var.apache_nginx_fpm_config["delay"]}" + new_host_delay = "${var.apache_nginx_fpm_config["delay"]}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + no_data_timeframe = 20 +} + + +resource "datadog_monitor" "FPM_process" { + name = "[${var.env}] FPM process is down on {{host.name}}" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" + + + type = "service check" + query = "\"process.up\".over(\"dd_monitoring:enabled\",\"dd_php_fpm:enabled\",\"process:php_fpm\",\"env:${var.env}\").by(\"host\",\"process\", \"app\").last(4).count_by_status()" + count = "${var.dd_php_fpm == "enabled" ? 
1 : 0 }" + + thresholds = { + ok = 1 + warning = 2 + critical = 4 + } + + notify_no_data = "${var.apache_nginx_fpm_config["notify_no_data"]}" + evaluation_delay = "${var.apache_nginx_fpm_config["delay"]}" + new_host_delay = "${var.apache_nginx_fpm_config["delay"]}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + no_data_timeframe = 20 +} diff --git a/monitors_nginx.tf b/monitors_nginx.tf index 3d36a99..01a95fe 100644 --- a/monitors_nginx.tf +++ b/monitors_nginx.tf @@ -1,11 +1,11 @@ resource "datadog_monitor" "Nginx_process" { - name = "Nginx process is down on {{host.name}}" + name = "[${var.env}] Nginx process is down on {{host.name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" type = "service check" - query = "process.up.over(dd_monitoring:enabled,process:nginx).exclude(dd_custom_nginx:enabled).last(4).count_by_status()" - count = "${var.dd_nginx_basics == "true" ? 1 : 0 }" + query = "\"process.up\".over(\"dd_monitoring:enabled\",\"dd_nginx:enabled\",\"process:nginx\",\"env:${var.env}\").by(\"host\",\"process\", \"app\").last(4).count_by_status()" + count = "${var.dd_nginx == "enabled" ? 
1 : 0 }" thresholds = { ok = 1 @@ -13,14 +13,14 @@ resource "datadog_monitor" "Nginx_process" { critical = 4 } - notify_no_data = false + notify_no_data = "${var.apache_nginx_fpm_config["notify_no_data"]}" + evaluation_delay = "${var.apache_nginx_fpm_config["delay"]}" + new_host_delay = "${var.apache_nginx_fpm_config["delay"]}" + renotify_interval = 60 notify_audit = false timeout_h = 0 include_tags = true locked = false require_full_window = true - new_host_delay = 300 - notify_no_data = false - renotify_interval = 0 no_data_timeframe = 20 } diff --git a/monitors_php.tf b/monitors_php.tf deleted file mode 100644 index 7a9651c..0000000 --- a/monitors_php.tf +++ /dev/null @@ -1,27 +0,0 @@ -resource "datadog_monitor" "php-fpm_process_idle" { - name = "php_fpm busy worker > 99% on {{host.name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" - - - type = "service check" - query = "avg(last_10m):avg:php_fpm.processes.active{\"dd_monitoring:enabled\",\"dd_php_basics:enabled\",\"!dd_custom_php:enabled\"} by {host} / ( avg:php_fpm.processes.idle{\"dd_monitoring:enabled\",\"dd_php_basics:enabled\",\"!dd_custom_php:enabled\"} by {host} + avg:php_fpm.processes.active{\"dd_monitoring:enabled\",\"dd_php_basics:enabled\",\"!dd_custom_php:enabled\"} by {host} ) > 0.99" - count = "${var.dd_php_basics == "true" ? 1 : 0 }" - - thresholds { - warning = 0.95 - critical = 0.99 - } - - notify_no_data = false - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = 300 - renotify_interval = 0 - no_data_timeframe = 20 -} - -