diff --git a/cloud/aws/elb/monitors-elb.tf b/cloud/aws/elb/monitors-elb.tf index acabd1a..1f4d548 100644 --- a/cloud/aws/elb/monitors-elb.tf +++ b/cloud/aws/elb/monitors-elb.tf @@ -122,5 +122,3 @@ resource "datadog_monitor" "ELB_backend_latency" { tags = ["*"] } - - diff --git a/cloud/aws/rds-mysql/monitors-rds_mysql-basics.tf b/cloud/aws/rds-mysql/monitors-rds_mysql-basics.tf index 95557e0..87e0746 100644 --- a/cloud/aws/rds-mysql/monitors-rds_mysql-basics.tf +++ b/cloud/aws/rds-mysql/monitors-rds_mysql-basics.tf @@ -1,13 +1,9 @@ - - - resource "datadog_monitor" "rds-mysql_cpu_80_15min" { name = "[${var.env}] rds Cpu high > 90% for 15 min on {{host.identifier}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" count = "${var.dd_aws_rds == "enabled" ? 1 : 0 }" - query = "avg(last_15m):avg:aws.rds.cpuutilization{dd_monitoring:enabled,dd_aws_rds:enabled,env:${var.env}} by {region,name} > 90" type = "query alert" @@ -33,12 +29,11 @@ resource "datadog_monitor" "rds-mysql_cpu_80_15min" { resource "datadog_monitor" "mysql_rds_free_space_low" { name = "[${var.env}] rds free space low < 10 % on {{host.identifier}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" - + type = "query alert" query = "avg(last_15m):avg:aws.rds.free_storage_space{dd_monitoring:enabled,dd_aws_rds:enabled,env:${var.env}} by {region,name} / avg:aws.rds.total_storage_space{dd_monitoring:enabled,dd_aws_rds:enabled,env:${var.env}} by {region,name} * 100 < 10" count = "${var.dd_aws_rds == "enabled" ? 1 : 0 }" - thresholds { warning = "${var.rds_mem_threshold["warning"]}" critical = "${var.rds_mem_threshold["critical"]}" @@ -56,6 +51,4 @@ resource "datadog_monitor" "mysql_rds_free_space_low" { require_full_window = true new_host_delay = "${var.rds_config["delay"]}" no_data_timeframe = 20 - - -} \ No newline at end of file +} diff --git a/incubator/monitors-cassandra.tf b/incubator/monitors-cassandra.tf index 2911778..242c093 100644 --- a/incubator/monitors-cassandra.tf +++ b/incubator/monitors-cassandra.tf @@ -1,8 +1,8 @@ resource "datadog_monitor" "datadog_cassandra_down" { - name = "Cassandra service is down" + name = "Cassandra service is down" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" query = "\"cassandra.can_connect\".over(\"cassandra-node\").by(\"host\",\"instance\").last(2).count_by_status()" - type = "service check" + type = "service check" notify_no_data = false renotify_interval = 60 @@ -36,3 +36,4 @@ renotify_interval = 0 no_data_timeframe = 20 }*/ + diff --git a/incubator/monitors-gcp-lb.tf b/incubator/monitors-gcp-lb.tf index 71e7593..d1c6a27 100644 --- a/incubator/monitors-gcp-lb.tf +++ b/incubator/monitors-gcp-lb.tf @@ -1,8 +1,8 @@ resource "datadog_monitor" "datadog_gcp_lb_request_count" { - name = "GCP LoadBalancer request count changed too fast" + name = "GCP LoadBalancer request count changed too fast" message = "" query = "change(sum(last_5m),last_30m):avg:gcp.loadbalancing.http.request_count{*} by {backend_name}.as_count() >= 300" - type = "query alert" + type = "query alert" notify_no_data = false renotify_interval = 60 @@ -15,15 +15,13 @@ resource "datadog_monitor" "datadog_gcp_lb_request_count" { notify_no_data = false renotify_interval = 0 no_data_timeframe = 20 - } - resource "datadog_monitor" "datadog_gcp_lb_500" { - name = "GCP LoadBalancer 500 ratio > 5%" + name = "GCP LoadBalancer 500 ratio > 5%" message = "" query = "sum(last_10m):avg:gcp.loadbalancing.http.request_count{response_code_class:500} by {backend_name}.as_count() / avg:gcp.loadbalancing.http.request_count{*} by {backend_name}.as_count() > 0.2" - type = "query alert" + type = "query alert" notify_no_data = false renotify_interval = 60 @@ -36,15 +34,13 @@ resource "datadog_monitor" "datadog_gcp_lb_500" { notify_no_data = false renotify_interval = 0 no_data_timeframe = 20 - } - resource "datadog_monitor" "datadog_gcp_lb_backend_latency" { - name = "GCP LB backend latency > 2s" + name = "GCP LB backend latency > 2s" message = "" query = "min(last_5m):avg:gcp.loadbalancing.http.backend_latencies.avg{*} by {backend_name} > 2000" - type = "metric alert" + type = "metric alert" notify_no_data = false renotify_interval = 60 @@ -57,15 +53,13 @@ resource "datadog_monitor" "datadog_gcp_lb_backend_latency" { notify_no_data = false renotify_interval = 0 no_data_timeframe = 20 - } - resource "datadog_monitor" "datadog_gcp_lb_latency" { - name = "GCP LB latency > 5s" + name = "GCP LB latency > 5s" message = "" query = "avg(last_5m):avg:gcp.loadbalancing.http.total_latencies.avg{*} > 5000" - type = "query alert" + type = "query alert" thresholds { warning = 3000 @@ -83,5 +77,4 @@ resource "datadog_monitor" "datadog_gcp_lb_latency" { notify_no_data = false renotify_interval = 0 no_data_timeframe = 20 - } diff --git a/inputs.tf b/inputs.tf index 5e6720b..df8219c 100644 --- a/inputs.tf +++ b/inputs.tf @@ -4,20 +4,23 @@ variable "ho_escalation_group" {} variable env {} variable region {} - ##linux variable "dd_linux_basics" { default = "disabled" } + variable "linux_basics_config" { type = "map" + default = { notify_no_data = false - delay = 900 + delay = 900 } } + variable "dd_custom_cpu" { type = "map" + default = { status = "disabled" } @@ -37,55 +40,64 @@ variable "cpu_15_critical" { variable "dd_aws_rds" { default = "disabled" } + variable "rds_config" { type = "map" + default = { notify_no_data = false - delay = 900 - } -} -variable "rds_cpu_threshold" { - type = "map" - default = { - warning = 80 - critical = 90 - } -} -variable "rds_mem_threshold" { - default = { - warning = 20 - critical = 10 + delay = 900 } } +variable "rds_cpu_threshold" { + type = "map" + + default = { + warning = 80 + critical = 90 + } +} + +variable "rds_mem_threshold" { + default = { + warning = 20 + critical = 10 + } +} ## ELB variable "dd_aws_elb" { default = "disable" } + variable "elb_config" { type = "map" + default = { notify_no_data = false - delay = 900 + delay = 900 } } + variable "elb_5xx_threshold" { default = { - warning = 5 - critical = 10 + warning = 5 + critical = 10 } } + variable "elb_4xx_threshold" { default = { - warning = 5 - critical = 10 + warning = 5 + critical = 10 } } + variable "elb_backend_latency" { default = { - warning = 1000 - critical = 5000 + warning = 1000 + critical = 5000 } } @@ -93,6 +105,7 @@ variable "elb_backend_latency" { variable "dd_nginx" { default = "disabled" } + variable "dd_php_fpm" { default = "disabled" } @@ -100,17 +113,21 @@ variable "dd_php_fpm" { variable "dd_apache" { default = "disabled" } + variable "apache_nginx_fpm_config" { type = "map" + default = { notify_no_data = false - delay = 900 + delay = 900 } } + variable "php_fpm_busy_threshold" { type = "map" + default = { - warning = 0.8 - critical = 0.9 + warning = 0.8 + critical = 0.9 } -} \ No newline at end of file +} diff --git a/middleware/apache/monitors-apache.tf b/middleware/apache/monitors-apache.tf index d807312..a7261c1 100644 --- a/middleware/apache/monitors-apache.tf +++ b/middleware/apache/monitors-apache.tf @@ -2,14 +2,13 @@ resource "datadog_monitor" "Apache_process" { name = "[${var.env}] Nginx process is down on {{host.name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" - type = "service check" query = "\"process.up\".over(\"dd_monitoring:enabled\",\"dd_apache:enabled\",\"process:apache\",\"env:${var.env}\").by(\"host\",\"process\", \"app\").last(4).count_by_status()" count = "${var.dd_apache == "enabled" ? 1 : 0 }" thresholds = { - ok = 1 - warning = 2 + ok = 1 + warning = 2 critical = 4 } diff --git a/middleware/nginx/monitors-nginx.tf b/middleware/nginx/monitors-nginx.tf index 8baf1cc..e09c010 100644 --- a/middleware/nginx/monitors-nginx.tf +++ b/middleware/nginx/monitors-nginx.tf @@ -2,14 +2,13 @@ resource "datadog_monitor" "Nginx_process" { name = "[${var.env}] Nginx process is down on {{host.name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" - type = "service check" query = "\"process.up\".over(\"dd_monitoring:enabled\",\"dd_nginx:enabled\",\"process:nginx\",\"env:${var.env}\").by(\"host\",\"process\", \"app\").last(4).count_by_status()" count = "${var.dd_nginx == "enabled" ? 1 : 0 }" thresholds = { - ok = 1 - warning = 2 + ok = 1 + warning = 2 critical = 4 } diff --git a/middleware/php-fpm/monitors-fpm.tf b/middleware/php-fpm/monitors-fpm.tf index 844d6ac..97e7ff8 100644 --- a/middleware/php-fpm/monitors-fpm.tf +++ b/middleware/php-fpm/monitors-fpm.tf @@ -2,7 +2,6 @@ resource "datadog_monitor" "php-fpm_process_idle" { name = "[${var.env}] php_fpm busy worker > 90% on {{host.name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" - type = "query alert" query = "avg(last_10m):avg:php_fpm.processes.active{dd_monitoring:enabled,dd_php_fpm:enabled,env:${var.env}} by {host,region} / ( avg:php_fpm.processes.idle{dd_monitoring:enabled,dd_php_fpm:enabled,env:${var.env}} by {host,region} + avg:php_fpm.processes.active{dd_monitoring:enabled,dd_php_fpm:enabled,env:${var.env}} by {host,region} ) > 0.9" count = "${var.dd_php_fpm == "enabled" ? 1 : 0 }" @@ -27,19 +26,17 @@ resource "datadog_monitor" "php-fpm_process_idle" { tags = ["*"] } - resource "datadog_monitor" "FPM_process" { name = "[${var.env}] FPM process is down on {{host.name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" - type = "service check" query = "\"process.up\".over(\"dd_monitoring:enabled\",\"dd_php_fpm:enabled\",\"process:php_fpm\",\"env:${var.env}\").by(\"host\",\"process\", \"app\").last(4).count_by_status()" count = "${var.dd_nginx == "enabled" ? 1 : 0 }" thresholds = { - ok = 1 - warning = 2 + ok = 1 + warning = 2 critical = 4 } diff --git a/system/generic/monitors-custom-cpu.tf b/system/generic/monitors-custom-cpu.tf index c73d532..ef2d695 100644 --- a/system/generic/monitors-custom-cpu.tf +++ b/system/generic/monitors-custom-cpu.tf @@ -22,4 +22,3 @@ resource "datadog_monitor" "cpu_custom" { require_full_window = true no_data_timeframe = 20 } - diff --git a/system/linux/monitors-linux-basics.tf b/system/linux/monitors-linux-basics.tf index ee92cd5..73f7ed7 100644 --- a/system/linux/monitors-linux-basics.tf +++ b/system/linux/monitors-linux-basics.tf @@ -30,7 +30,7 @@ resource "datadog_monitor" "cpu_95_5min" { query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region} > ${var.cpu_5_critical}" type = "query alert" - count = "${var.dd_linux_basics == "enabled" ? 1 : 0}" + count = "${var.dd_linux_basics == "enabled" ? 1 : 0}" thresholds { critical = "${var.cpu_5_critical}" @@ -51,12 +51,12 @@ resource "datadog_monitor" "cpu_95_5min" { } resource "datadog_monitor" "datadog_free_disk_space_5" { - name = "[${var.env}] Free disk space < 5% on {{host.name}}" + name = "[${var.env}] Free disk space < 5% on {{host.name}}" message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device} * 100 < 5" type = "query alert" - count = "${var.dd_linux_basics == "enabled" ? 1 : 0}" + count = "${var.dd_linux_basics == "enabled" ? 1 : 0}" thresholds { critical = 5 @@ -164,22 +164,20 @@ resource "datadog_monitor" "datadog_free_disk_space_inodes_10" { # type = "query alert" # count = "${var.dd_linux_basics == "enabled" ? 1 : 0}" - - # notify_no_data = "${var.linux_basics_config["notify_no_data"]}" - # evaluation_delay = "${var.linux_basics_config["delay"]}" - # new_host_delay = "${var.linux_basics_config["delay"]}" - # renotify_interval = 60 - # notify_audit = false - # timeout_h = 0 - # include_tags = true - # locked = false - # require_full_window = true - # no_data_timeframe = 20 +# notify_no_data = "${var.linux_basics_config["notify_no_data"]}" +# evaluation_delay = "${var.linux_basics_config["delay"]}" +# new_host_delay = "${var.linux_basics_config["delay"]}" +# renotify_interval = 60 +# notify_audit = false +# timeout_h = 0 +# include_tags = true +# locked = false +# require_full_window = true +# no_data_timeframe = 20 # } - resource "datadog_monitor" "datadog_free_memory" { - name = "[${var.env}] Free memory < 5% on {{host.name}}" + name = "[${var.env}] Free memory < 5% on {{host.name}}" message = "Debugging alert - no escalation" query = "sum(last_1m):avg:system.mem.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_memory:enabled} by {host,region} / avg:system.mem.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_memory:enabled} by {host,region} * 100 < 5" @@ -208,18 +206,21 @@ resource "datadog_monitor" "datadog_free_memory" { # name = "Host unreachable" # message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group} \n{{/is_recovery}}" + # query = "datadog.agent.up.over(dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled).last(1).count_by_status()" # type = "service check" # count = "${var.dd_linux_basics == "enabled" ? 1 : 0}" - # notify_no_data = "${var.linux_basics_config["notify_no_data"]}" - # evaluation_delay = "${var.linux_basics_config["delay"]}" - # new_host_delay = "${var.linux_basics_config["delay"]}" - # renotify_interval = 60 - # notify_audit = false - # timeout_h = 0 - # include_tags = true - # locked = false - # require_full_window = true - # no_data_timeframe = 20 + +# notify_no_data = "${var.linux_basics_config["notify_no_data"]}" +# evaluation_delay = "${var.linux_basics_config["delay"]}" +# new_host_delay = "${var.linux_basics_config["delay"]}" +# renotify_interval = 60 +# notify_audit = false +# timeout_h = 0 +# include_tags = true +# locked = false +# require_full_window = true +# no_data_timeframe = 20 # } + diff --git a/testing/dummy/dummy.tf b/testing/dummy/dummy.tf index f5cc69c..171893e 100644 --- a/testing/dummy/dummy.tf +++ b/testing/dummy/dummy.tf @@ -1,3 +1 @@ - - -resource "null_resource" "test-migration" {} \ No newline at end of file +resource "null_resource" "test-migration" {} diff --git a/testing/inputs.tf b/testing/inputs.tf index 1f174cf..8180be3 100644 --- a/testing/inputs.tf +++ b/testing/inputs.tf @@ -5,6 +5,7 @@ variable region {} variable "critical_escalation_group" { default = "@pagerduty_HODummy" } + variable "warning_escalation_group" { default = "@pagerduty_HNODummy" } @@ -19,4 +20,3 @@ variable "dd_linux_basics" { variable "dd_aws_rds" { default = "enabled" } - diff --git a/testing/modules.tf b/testing/modules.tf index 258d85b..19106e1 100644 --- a/testing/modules.tf +++ b/testing/modules.tf @@ -1,14 +1,12 @@ module "datadog-monitors" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git" - env = "${var.environment}" - region = "${var.region}" + env = "${var.environment}" + region = "${var.region}" critical_escalation_group = "${var.critical_escalation_group}" warning_escalation_group = "${var.warning_escalation_group}" - - dd_aws_rds = "${var.dd_aws_rds}" - dd_linux_basics = "${var.dd_linux_basics}" - + dd_aws_rds = "${var.dd_aws_rds}" + dd_linux_basics = "${var.dd_linux_basics}" }