From e4e929ec1d6bb380da5eed54bd324ab33fc513ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 12:47:19 +0100 Subject: [PATCH 01/93] MON-78 Add datadog monitor for stream analytics --- cloud/azure/stream-analytics/inputs.tf | 44 +++++++++ .../monitors-stream-analytics.tf | 92 +++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 cloud/azure/stream-analytics/inputs.tf create mode 100644 cloud/azure/stream-analytics/monitors-stream-analytics.tf diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf new file mode 100644 index 0000000..e9bc507 --- /dev/null +++ b/cloud/azure/stream-analytics/inputs.tf @@ -0,0 +1,44 @@ +variable "hno_escalation_group" {} +variable "ho_escalation_group" {} + +variable "environment" {} + +variable "notify_no_data" { + default = "false" +} + +variable "delay" { + default = "600" +} + +variable "su_utilization_warning" { + default = 60 +} + +variable "su_utilization_critical" { + default = 80 +} + +variable "failed_function_requests_warning" { + default = 0 +} + +variable "failed_function_requests_critical" { + default = 10 +} + +variable "conversion_errors_warning" { + default = 0 +} + +variable "conversion_errors_critical" { + default = 10 +} + +variable "runtime_errors_warning" { + default = 0 +} + +variable "runtime_errors_critical" { + default = 0 +} diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf new file mode 100644 index 0000000..f18d7f1 --- /dev/null +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -0,0 +1,92 @@ +resource "datadog_monitor" "SU_utilization" { + name = "[${var.environment} SU utilization at more than ${var.su_utilization_critical}% on {{host.identifier}}]" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = 
"avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{*} by {name,resource_group} > ${var.su_utilization_critical}" + type = "query alert" + + notify_no_data = "${var.notify_no_data}" + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.su_utilization_warning}" + critical = "${var.su_utilization_critical}" + } +} + +resource "datadog_monitor" "failed_function_requests" { + name = "[${var.environment} More than ${var.failed_function_requests_critical} failed function requests on {{host.identifier}}]" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}" + type = "query alert" + + notify_no_data = "${var.notify_no_data}" + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.failed_function_requests_warning}" + critical = "${var.failed_function_requests_critical}" + } +} + +resource "datadog_monitor" "conversion_errors" { + name = "[${var.environment} More than ${var.conversion_errors_critical} conversion errors on {{host.identifier}}]" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_critical}" + type = "query alert" + + notify_no_data = "${var.notify_no_data}" + 
evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.conversion_errors_warning}" + critical = "${var.conversion_errors_critical}" + } +} + +resource "datadog_monitor" "runtime_errors" { + name = "[${var.environment} More than ${var.runtime_errors_critical} runtime errors on {{host.identifier}}]" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}" + type = "query alert" + + notify_no_data = "${var.notify_no_data}" + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.runtime_errors_warning}" + critical = "${var.runtime_errors_critical}" + } +} + From 17fa260daf594ab65043310053f0f534d49bff7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 13:08:17 +0100 Subject: [PATCH 02/93] MON-78 Corrected bad warning value for runtime_errors --- cloud/azure/stream-analytics/inputs.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index e9bc507..4ea5ee6 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -40,5 +40,5 @@ variable "runtime_errors_warning" { } variable "runtime_errors_critical" { - default = 0 + default = 10 } From daabb7244af225ccffc2580fbb2b441586163bba Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 14:30:05 +0100 Subject: [PATCH 
03/93] MON-80 Add inputs and monitors files --- cloud/azure/iothubs/inputs.tf | 36 ++++++++++ cloud/azure/iothubs/monitors-iothubs.tf | 90 +++++++++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 cloud/azure/iothubs/inputs.tf create mode 100644 cloud/azure/iothubs/monitors-iothubs.tf diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf new file mode 100644 index 0000000..ddc3456 --- /dev/null +++ b/cloud/azure/iothubs/inputs.tf @@ -0,0 +1,36 @@ +variable "hno_escalation_group" {} + +variable "ho_escalation_group" {} + +variable "environment" {} + +variable "subscription_id" {} + +## IOT hubs +variable "delay" { + default = 600 +} + +variable "warning_jobs_failed" { + default = 5 +} + +variable "critical_jobs_failed" { + default = 10 +} + +variable "warning_listjobs_failed" { + default = 5 +} + +variable "critical_listjobs_failed" { + default = 10 +} + +variable "warning_queryjobs_failed" { + default = 5 +} + +variable "critical_queryjobs_failed" { + default = 10 +} \ No newline at end of file diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf new file mode 100644 index 0000000..5f584db --- /dev/null +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -0,0 +1,90 @@ +resource "datadog_monitor" "too_many_jobs_failed" { + name = "[${var.environment}] Too many jobs failed on {{name}} " + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.critical_jobs_failed}" + type = "query alert" + + thresholds { + warning = "${var.warning_jobs_failed}" + critical = "${var.critical_jobs_failed}" + } + + notify_no_data = false + 
evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_list_jobs_failed" { + name = "[${var.environment}] Too many list_jobs failure on {{name}} " + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_listjobs_failed}" + type = "query alert" + + thresholds { + warning = "${var.warning_listjobs_failed}" + critical = "${var.critical_listjobs_failed}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_query_jobs_failed" { + name = "[${var.environment}] Too many query_jobs failed on {{name}} " + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_queryjobs_failed}" + type = "query alert" + + thresholds { + warning = "${var.warning_queryjobs_failed}" + critical = "${var.critical_queryjobs_failed}" + } + + 
notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "status" { + name = "[${var.environment}] Status is not ok on {{name}} " + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1" + type = "query alert" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} \ No newline at end of file From 7f0a0e91cf6fdd3cb6ea5d33abcbf2dbdb41c0a2 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 15:21:40 +0100 Subject: [PATCH 04/93] MON-80 Rename variable for message alerting --- cloud/azure/iothubs/inputs.tf | 8 +++++--- cloud/azure/iothubs/monitors-iothubs.tf | 8 ++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index ddc3456..5de7dab 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -1,10 +1,12 @@ -variable "hno_escalation_group" {} +variable "critical_escalation_group" {} -variable "ho_escalation_group" {} +variable "warning_escalation_group" {} variable "environment" {} -variable "subscription_id" {} +variable "stack" {} + +variable "client_name" {} ## IOT hubs variable "delay" { diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 5f584db..e333808 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,6 +1,6 @@ resource "datadog_monitor" 
"too_many_jobs_failed" { name = "[${var.environment}] Too many jobs failed on {{name}} " - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.critical_jobs_failed}" type = "query alert" @@ -24,7 +24,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] Too many list_jobs failure on {{name}} " - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_listjobs_failed}" type = "query alert" @@ -48,7 +48,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] Too many query_jobs failed on {{name}} " - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} 
\n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_queryjobs_failed}" type = "query alert" @@ -72,7 +72,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { resource "datadog_monitor" "status" { name = "[${var.environment}] Status is not ok on {{name}} " - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1" type = "query alert" From 4c474be541eb5f3a3f14bf2a8bd7716803651ecf Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 16:42:58 +0100 Subject: [PATCH 05/93] MON-80 Add monitors and update variables --- cloud/azure/iothubs/inputs.tf | 44 +++++++++++----- cloud/azure/iothubs/monitors-iothubs.tf | 69 ++++++++++++++++++++----- 2 files changed, 86 insertions(+), 27 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 5de7dab..38b1b44 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -1,38 +1,54 @@ -variable "critical_escalation_group" {} - -variable "warning_escalation_group" {} - variable "environment" {} variable "stack" {} variable "client_name" {} -## IOT hubs variable "delay" { default = 600 } -variable "warning_jobs_failed" { - default = 5 +## IOT hubs +variable "jobs_failed_threshold_warning" { + default = 0 } -variable "critical_jobs_failed" { +variable "jobs_failed_threshold_critical" { default = 10 } 
-variable "warning_listjobs_failed" { - default = 5 +variable "jobs_failed_message" {} + +variable "listjobs_failed_threshold_warning" { + default = 0 } -variable "critical_listjobs_failed" { +variable "listjobs_failed_threshold_critical" { default = 10 } -variable "warning_queryjobs_failed" { - default = 5 +variable "listjobs_failed_message" {} + +variable "queryjobs_failed_threshold_warning" { + default = 0 } -variable "critical_queryjobs_failed" { +variable "queryjobs_failed_threshold_critical" { + default = 10 +} + +variable "queryjobs_failed_message" {} + +variable "status_message" {} + +variable "total_devices_message" {} + +variable "c2d_methods_failed_message" {} + +variable "c2d_methods_failed_threshold_warning" { + default = 0 +} + +variable "c2d_methods_failed_threshold_critical" { default = 10 } \ No newline at end of file diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index e333808..12f3d9a 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,13 +1,13 @@ resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] Too many jobs failed on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.jobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.critical_jobs_failed}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > 
${var.jobs_failed_threshold_critical}" type = "query alert" thresholds { - warning = "${var.warning_jobs_failed}" - critical = "${var.critical_jobs_failed}" + warning = "${var.jobs_failed_threshold_warning}" + critical = "${var.jobs_failed_threshold_critical}" } notify_no_data = false @@ -24,14 +24,14 @@ resource "datadog_monitor" "too_many_jobs_failed" { resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] Too many list_jobs failure on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.listjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_listjobs_failed}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" type = "query alert" thresholds { - warning = "${var.warning_listjobs_failed}" - critical = "${var.critical_listjobs_failed}" + warning = "${var.listjobs_failed_threshold_warning}" + critical = "${var.listjobs_failed_threshold_critical}" } notify_no_data = false @@ -48,14 +48,14 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] Too many query_jobs failed on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = 
"${var.queryjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_queryjobs_failed}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" type = "query alert" thresholds { - warning = "${var.warning_queryjobs_failed}" - critical = "${var.critical_queryjobs_failed}" + warning = "${var.queryjobs_failed_threshold_warning}" + critical = "${var.queryjobs_failed_threshold_warning}" } notify_no_data = false @@ -72,11 +72,54 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { resource "datadog_monitor" "status" { name = "[${var.environment}] Status is not ok on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.status_message}" query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1" type = "query alert" + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "total_devices" { + name = "[${var.environment}] Total devices is wrong on {{name}} " + message = "${var.total_devices_message}" + + query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{*} by {name,resource_group} == 0" + type = 
"query alert" + + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_c2d_methods_failed" { + name = "[${var.environment}] Too many c2d methods failure on {{name}} " + message = "${var.c2d_methods_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.c2d_methods_failed_threshold_warning}" + critical = "${var.c2d_methods_failed_threshold_critical}" + } + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 60 From effaaf0e12d6446510700e4bc71765c8a0b37441 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 17:13:42 +0100 Subject: [PATCH 06/93] MON-80 Add c2d and d2c monitors --- cloud/azure/iothubs/inputs.tf | 46 +++++++++++- cloud/azure/iothubs/monitors-iothubs.tf | 98 ++++++++++++++++++++++++- 2 files changed, 140 insertions(+), 4 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 38b1b44..093b3a3 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -43,12 +43,52 @@ variable "status_message" {} variable "total_devices_message" {} -variable "c2d_methods_failed_message" {} - variable "c2d_methods_failed_threshold_warning" { default = 0 } variable "c2d_methods_failed_threshold_critical" { default = 10 -} \ No newline at end of file +} + +variable "c2d_methods_failed_message" {} + +variable "c2d_twin_read_failed_threshold_warning" { + default = 0 +} + 
+variable "c2d_twin_read_failed_threshold_critical" { + default = 10 +} + +variable "c2d_twin_read_failed_message" {} + +variable "c2d_twin_update_failed_threshold_warning" { + default = 0 +} + +variable "c2d_twin_update_failed_threshold_critical" { + default = 10 +} + +variable "c2d_twin_update_failed_message" {} + +variable "d2c_twin_read_failed_threshold_warning" { + default = 0 +} + +variable "d2c_twin_read_failed_threshold_critical" { + default = 10 +} + +variable "d2c_twin_read_failed_message" {} + +variable "d2c_twin_update_failed_threshold_warning" { + default = 0 +} + +variable "d2c_twin_update_failed_threshold_critical" { + default = 10 +} + +variable "d2c_twin_update_failed_message" {} \ No newline at end of file diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 12f3d9a..8d44dde 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -130,4 +130,100 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 -} \ No newline at end of file +} + +resource "datadog_monitor" "too_many_c2d_twin_read_failed" { + name = "[${var.environment}] Too many c2d twin read failure on {{name}} " + message = "${var.c2d_twin_read_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.c2d_twin_read_failed_threshold_warning}" + critical = "${var.c2d_twin_read_failed_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + 
include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_c2d_twin_update_failed" { + name = "[${var.environment}] Too many c2d twin update failure on {{name}} " + message = "${var.c2d_twin_update_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.c2d_twin_update_failed_threshold_warning}" + critical = "${var.c2d_twin_update_failed_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_twin_read_failed" { + name = "[${var.environment}] Too many d2c twin read failure on {{name}} " + message = "${var.d2c_twin_read_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_twin_read_failed_threshold_warning}" + critical = "${var.d2c_twin_read_failed_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + 
require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_twin_update_failed" { + name = "[${var.environment}] Too many d2c twin update failure on {{name}} " + message = "${var.d2c_twin_update_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_twin_update_failed_threshold_warning}" + critical = "${var.d2c_twin_update_failed_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} From 5136dd5c4d1e5b3bea7685b95f8361d21a02dc34 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 17:30:16 +0100 Subject: [PATCH 07/93] MON-80 Add subscription_id --- cloud/azure/iothubs/inputs.tf | 2 ++ cloud/azure/iothubs/monitors-iothubs.tf | 40 ++++++++++++------------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 093b3a3..e705d8f 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -4,6 +4,8 @@ variable "stack" {} variable "client_name" {} +variable "subscription_id" {} + variable "delay" { default = 600 } diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 8d44dde..a4ec018 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -2,7 +2,7 @@ resource "datadog_monitor" 
"too_many_jobs_failed" { name = "[${var.environment}] Too many jobs failed on {{name}} " message = "${var.jobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -12,7 +12,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -26,7 +26,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] Too many list_jobs failure on {{name}} " message = "${var.listjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + 
avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -36,7 +36,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -50,7 +50,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] Too many query_jobs failed on {{name}} " message = "${var.queryjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -60,7 +60,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -74,12 +74,12 @@ resource "datadog_monitor" "status" { name = "[${var.environment}] Status is not ok on {{name}} " message = "${var.status_message}" - query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1" + query = 
"avg(last_5m):avg:azure.devices_iothubs.status{subscription_id:${var.subscription_id}} by {name,resource_group} < 1" type = "query alert" notify_no_data = true evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -93,12 +93,12 @@ resource "datadog_monitor" "total_devices" { name = "[${var.environment}] Total devices is wrong on {{name}} " message = "${var.total_devices_message}" - query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{*} by {name,resource_group} == 0" + query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{subscription_id:${var.subscription_id}} by {name,resource_group} == 0" type = "query alert" notify_no_data = true evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -112,7 +112,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { name = "[${var.environment}] Too many c2d methods failure on {{name}} " message = "${var.c2d_methods_failed_message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}" type = "query alert" thresholds { @@ -122,7 +122,7 @@ resource "datadog_monitor" 
"too_many_c2d_methods_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -136,7 +136,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { name = "[${var.environment}] Too many c2d twin read failure on {{name}} " message = "${var.c2d_twin_read_failed_message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" type = "query alert" thresholds { @@ -146,7 +146,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -160,7 +160,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { name = "[${var.environment}] Too many c2d twin update failure on {{name}} " message = "${var.c2d_twin_update_failed_message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{*} by {name,resource_group}.as_count() ) 
) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" type = "query alert" thresholds { @@ -170,7 +170,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -184,7 +184,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { name = "[${var.environment}] Too many d2c twin read failure on {{name}} " message = "${var.d2c_twin_read_failed_message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" type = "query alert" thresholds { @@ -194,7 +194,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { notify_no_data = false evaluation_delay = 
"${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -208,7 +208,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { name = "[${var.environment}] Too many d2c twin update failure on {{name}} " message = "${var.d2c_twin_update_failed_message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" type = "query alert" thresholds { @@ -218,7 +218,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true From 193352c212277fac91d51ad336b704f7cde8d54c Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 18:09:03 +0100 Subject: [PATCH 08/93] MON-80 Add IOT Hub in Names --- cloud/azure/iothubs/monitors-iothubs.tf | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index a4ec018..f111897 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,5 +1,5 @@ resource 
"datadog_monitor" "too_many_jobs_failed" { - name = "[${var.environment}] Too many jobs failed on {{name}} " + name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" message = "${var.jobs_failed_message}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" @@ -23,7 +23,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { } resource "datadog_monitor" "too_many_list_jobs_failed" { - name = "[${var.environment}] Too many list_jobs failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}" message = "${var.listjobs_failed_message}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" @@ -47,7 +47,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { } resource "datadog_monitor" "too_many_query_jobs_failed" { - name = "[${var.environment}] Too many query_jobs failed on {{name}} " + name = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}" message = "${var.queryjobs_failed_message}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( 
avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" @@ -55,7 +55,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { thresholds { warning = "${var.queryjobs_failed_threshold_warning}" - critical = "${var.queryjobs_failed_threshold_warning}" + critical = "${var.queryjobs_failed_threshold_critical}" } notify_no_data = false @@ -71,7 +71,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { } resource "datadog_monitor" "status" { - name = "[${var.environment}] Status is not ok on {{name}} " + name = "[${var.environment}] IOT Hub Status is not ok on {{name}}" message = "${var.status_message}" query = "avg(last_5m):avg:azure.devices_iothubs.status{subscription_id:${var.subscription_id}} by {name,resource_group} < 1" @@ -90,7 +90,7 @@ resource "datadog_monitor" "status" { } resource "datadog_monitor" "total_devices" { - name = "[${var.environment}] Total devices is wrong on {{name}} " + name = "[${var.environment}] IOT Hub Total devices is wrong on {{name}}" message = "${var.total_devices_message}" query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{subscription_id:${var.subscription_id}} by {name,resource_group} == 0" @@ -109,7 +109,7 @@ resource "datadog_monitor" "total_devices" { } resource "datadog_monitor" "too_many_c2d_methods_failed" { - name = "[${var.environment}] Too many c2d methods failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many c2d methods failure on {{name}}" message = "${var.c2d_methods_failed_message}" query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by 
{name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}" @@ -133,7 +133,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { } resource "datadog_monitor" "too_many_c2d_twin_read_failed" { - name = "[${var.environment}] Too many c2d twin read failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many c2d twin read failure on {{name}}" message = "${var.c2d_twin_read_failed_message}" query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" @@ -157,7 +157,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { } resource "datadog_monitor" "too_many_c2d_twin_update_failed" { - name = "[${var.environment}] Too many c2d twin update failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many c2d twin update failure on {{name}}" message = "${var.c2d_twin_update_failed_message}" query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" @@ -181,7 +181,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { } resource "datadog_monitor" "too_many_d2c_twin_read_failed" 
{ - name = "[${var.environment}] Too many d2c twin read failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many d2c twin read failure on {{name}}" message = "${var.d2c_twin_read_failed_message}" query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" @@ -205,7 +205,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { } resource "datadog_monitor" "too_many_d2c_twin_update_failed" { - name = "[${var.environment}] Too many d2c twin update failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many d2c twin update failure on {{name}}" message = "${var.d2c_twin_update_failed_message}" query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" From 113d4aabd25fa1172dea821ef6b6f7688011f960 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 11:12:26 +0100 Subject: [PATCH 09/93] MON-80 Add monitors for telemetry --- cloud/azure/iothubs/inputs.tf | 44 ++++++++- cloud/azure/iothubs/monitors-iothubs.tf | 121 +++++++++++++++++++++++- 2 files changed, 161 insertions(+), 4 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index e705d8f..5ae0587 100644 --- 
a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -93,4 +93,46 @@ variable "d2c_twin_update_failed_threshold_critical" { default = 10 } -variable "d2c_twin_update_failed_message" {} \ No newline at end of file +variable "d2c_twin_update_failed_message" {} + +variable "d2c_telemetry_egress_dropped_threshold_warning" { + default = 500 +} + +variable "d2c_telemetry_egress_dropped_threshold_critical" { + default = 1000 +} + +variable "d2c_telemetry_egress_dropped_message" {} + +variable "d2c_telemetry_egress_orphaned_threshold_warning" { + default = 500 +} + +variable "d2c_telemetry_egress_orphaned_threshold_critical" { + default = 1000 +} + +variable "d2c_telemetry_egress_orphaned_message" {} + +variable "d2c_telemetry_egress_invalid_threshold_warning" { + default = 500 +} + +variable "d2c_telemetry_egress_invalid_threshold_critical" { + default = 1000 +} + +variable "d2c_telemetry_egress_invalid_message" {} + +variable "d2c_telemetry_egress_fallback_threshold_warning" { + default = 500 +} + +variable "d2c_telemetry_egress_fallback_threshold_critical" { + default = 1000 +} + +variable "d2c_telemetry_egress_fallback_message" {} + +variable "d2c_telemetry_ingress_nosent_message" {} diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index f111897..4c59099 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -2,7 +2,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" message = "${var.jobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 
> ${var.jobs_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -26,7 +26,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}" message = "${var.listjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -50,7 +50,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}" message = "${var.queryjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by 
{resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -227,3 +227,118 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 } + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped on {{name}}" + message = "${var.d2c_telemetry_egress_dropped_message}" + + query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_dropped_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_telemetry_egress_dropped_threshold_warning}" + critical = "${var.d2c_telemetry_egress_dropped_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress 
orphaned on {{name}}" + message = "${var.d2c_telemetry_egress_orphaned_message}" + + query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_orphaned_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_telemetry_egress_orphaned_threshold_warning}" + critical = "${var.d2c_telemetry_egress_orphaned_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid on {{name}}" + message = "${var.d2c_telemetry_egress_invalid_message}" + + query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_invalid_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_telemetry_egress_invalid_threshold_warning}" + critical = "${var.d2c_telemetry_egress_invalid_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress fallback on {{name}}" + message = "${var.d2c_telemetry_egress_fallback_message}" + + query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.fallback{subscription_id:${var.subscription_id}} by {name,resource_group} > 
${var.d2c_telemetry_egress_fallback_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_telemetry_egress_fallback_threshold_warning}" + critical = "${var.d2c_telemetry_egress_fallback_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress no sent on {{name}}" + message = "${var.d2c_telemetry_ingress_nosent_message}" + + query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() - avg:azure.devices_iothubs.d2c.telemetry.ingress.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() > 0" + type = "query alert" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} From 34ef735a076884ef27474431a0df695b9228858e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 14:18:15 +0100 Subject: [PATCH 10/93] MON-78: Changed host.identifier for name to identify the streamanalytics object with issues --- .../azure/stream-analytics/monitors-stream-analytics.tf | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index f18d7f1..ea2920f 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,5 +1,5 @@
resource "datadog_monitor" "SU_utilization" { - name = "[${var.environment} SU utilization at more than ${var.su_utilization_critical}% on {{host.identifier}}]" + name = "[${var.environment}] SU utilization at more than ${var.su_utilization_critical}% on {{name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{*} by {name,resource_group} > ${var.su_utilization_critical}" @@ -22,7 +22,7 @@ resource "datadog_monitor" "SU_utilization" { } resource "datadog_monitor" "failed_function_requests" { - name = "[${var.environment} More than ${var.failed_function_requests_critical} failed function requests on {{host.identifier}}]" + name = "[${var.environment}] More than ${var.failed_function_requests_critical} failed function requests on {{name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}" @@ -45,7 +45,7 @@ resource "datadog_monitor" "failed_function_requests" { } resource "datadog_monitor" "conversion_errors" { - name = "[${var.environment} More than ${var.conversion_errors_critical} conversion errors on {{host.identifier}}]" + name = "[${var.environment}] More than ${var.conversion_errors_critical} conversion errors on {{name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_critical}" @@ -68,7 +68,7 @@ resource "datadog_monitor" "conversion_errors" { } resource "datadog_monitor" "runtime_errors" { - name = "[${var.environment} More 
than ${var.runtime_errors_critical} runtime errors on {{host.identifier}}]" + name = "[${var.environment}] More than ${var.runtime_errors_critical} runtime errors on {{name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}" @@ -89,4 +89,3 @@ resource "datadog_monitor" "runtime_errors" { critical = "${var.runtime_errors_critical}" } } - From cf3309ce753a146901fce7d1bcb4871639f8d410 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 11:47:37 +0100 Subject: [PATCH 11/93] MON-80 Add README.md --- cloud/azure/iothubs/README.md | 109 ++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 cloud/azure/iothubs/README.md diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md new file mode 100644 index 0000000..d53bf2b --- /dev/null +++ b/cloud/azure/iothubs/README.md @@ -0,0 +1,109 @@ +Azure IoT Hubs DataDog monitors +============================ + +How to use this module +---------------------- + +``` +module "iothubs" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors" + + jobs_failed_message = "${module.datadog-message-alerting.alerting-message}" + listjobs_failed_message = "${module.datadog-message-alerting.alerting-message}" + queryjobs_failed_message = "${module.datadog-message-alerting.alerting-message}" + status_message = "${module.datadog-message-alerting.alerting-message}" + total_devices_message = "${module.datadog-message-alerting.alerting-message}" + c2d_methods_failed_message = "${module.datadog-message-alerting.alerting-message}" + c2d_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}" + c2d_twin_update_failed_message = 
"${module.datadog-message-alerting.alerting-message}" + d2c_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}" + d2c_twin_update_failed_message = "${module.datadog-message-alerting.alerting-message}" + d2c_telemetry_egress_dropped_message = "${module.datadog-message-alerting.alerting-message}" + d2c_telemetry_egress_orphaned_message = "${module.datadog-message-alerting.alerting-message}" + d2c_telemetry_egress_invalid_message = "${module.datadog-message-alerting.alerting-message}" + d2c_telemetry_egress_fallback_message = "${module.datadog-message-alerting.alerting-message}" + d2c_telemetry_ingress_nosent_message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" + stack = "${var.stack}" + client_name = "${var.client_name}" + subscription_id = "${var.subscription_id}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* Service status check +* Jobs failed average check +* Query Jobs failed average check +* List Jobs failed average check +* Total devices count check +* C2D methods failed average check +* C2D twin read failed average check +* C2D twin update failed average check +* D2C twin read failed average check +* D2C twin update failed average check +* D2C telemetry egress dropped count check +* D2C telemetry egress orphaned count check +* D2C telemetry egress invalid count check +* D2C telemetry egress fallback count check +* D2C telemetry ingress no sent count check + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| c2d_methods_failed_message | | string | - | yes | +| c2d_methods_failed_threshold_critical | | string | `10` | no | +| c2d_methods_failed_threshold_warning | | string | `0` | no | +| c2d_twin_read_failed_message | | string | - | yes | +| c2d_twin_read_failed_threshold_critical | | string | `10` | no | +| c2d_twin_read_failed_threshold_warning | | string | `0` | no | +| 
c2d_twin_update_failed_message | | string | - | yes | +| c2d_twin_update_failed_threshold_critical | | string | `10` | no | +| c2d_twin_update_failed_threshold_warning | | string | `0` | no | +| client_name | | string | - | yes | +| d2c_telemetry_egress_dropped_message | | string | - | yes | +| d2c_telemetry_egress_dropped_threshold_critical | | string | `1000` | no | +| d2c_telemetry_egress_dropped_threshold_warning | | string | `500` | no | +| d2c_telemetry_egress_fallback_message | | string | - | yes | +| d2c_telemetry_egress_fallback_threshold_critical | | string | `1000` | no | +| d2c_telemetry_egress_fallback_threshold_warning | | string | `500` | no | +| d2c_telemetry_egress_invalid_message | | string | - | yes | +| d2c_telemetry_egress_invalid_threshold_critical | | string | `1000` | no | +| d2c_telemetry_egress_invalid_threshold_warning | | string | `500` | no | +| d2c_telemetry_egress_orphaned_message | | string | - | yes | +| d2c_telemetry_egress_orphaned_threshold_critical | | string | `1000` | no | +| d2c_telemetry_egress_orphaned_threshold_warning | | string | `500` | no | +| d2c_telemetry_ingress_nosent_message | | string | - | yes | +| d2c_twin_read_failed_message | | string | - | yes | +| d2c_twin_read_failed_threshold_critical | | string | `10` | no | +| d2c_twin_read_failed_threshold_warning | | string | `0` | no | +| d2c_twin_update_failed_message | | string | - | yes | +| d2c_twin_update_failed_threshold_critical | | string | `10` | no | +| d2c_twin_update_failed_threshold_warning | | string | `0` | no | +| delay | | string | `600` | no | +| environment | | string | - | yes | +| jobs_failed_message | | string | - | yes | +| jobs_failed_threshold_critical | | string | `10` | no | +| jobs_failed_threshold_warning | # IOT hubs | string | `0` | no | +| listjobs_failed_message | | string | - | yes | +| listjobs_failed_threshold_critical | | string | `10` | no | +| listjobs_failed_threshold_warning | | string | `0` | no | +| queryjobs_failed_message 
| | string | - | yes | +| queryjobs_failed_threshold_critical | | string | `10` | no | +| queryjobs_failed_threshold_warning | | string | `0` | no | +| stack | | string | - | yes | +| status_message | | string | - | yes | +| subscription_id | | string | - | yes | +| total_devices_message | | string | - | yes | + +Related documentation +--------------------- + +DataDog documentation: https://docs.datadoghq.com/integrations/azure_iot_hub/ + +Azure IOT Hubs metrics documentation: https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health \ No newline at end of file From 51b3b5010da96533a605c94f2d9e6d44ea05f495 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 15:22:52 +0100 Subject: [PATCH 12/93] MON-78 Changed variable names --- cloud/azure/stream-analytics/inputs.tf | 8 ++++++-- .../stream-analytics/monitors-stream-analytics.tf | 10 +++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 4ea5ee6..529e669 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -1,5 +1,5 @@ -variable "hno_escalation_group" {} -variable "ho_escalation_group" {} +variable "critical_escalation_group" {} +variable "warning_escalation_group" {} variable "environment" {} @@ -7,6 +7,10 @@ variable "notify_no_data" { default = "false" } +variable "filter_tags" { + default = "*" +} + variable "delay" { default = "600" } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index ea2920f..4e64044 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,8 +1,8 @@ resource "datadog_monitor" "SU_utilization" { name = "[${var.environment}] SU utilization at more than ${var.su_utilization_critical}% on {{name}}" - message = 
"{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{*} by {name,resource_group} > ${var.su_utilization_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > ${var.su_utilization_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -23,7 +23,7 @@ resource "datadog_monitor" "SU_utilization" { resource "datadog_monitor" "failed_function_requests" { name = "[${var.environment}] More than ${var.failed_function_requests_critical} failed function requests on {{name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}" type = "query alert" @@ -46,7 +46,7 @@ resource "datadog_monitor" "failed_function_requests" { resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] More than ${var.conversion_errors_critical} conversion errors on {{name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > 
${var.conversion_errors_critical}" type = "query alert" @@ -69,7 +69,7 @@ resource "datadog_monitor" "conversion_errors" { resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_critical} runtime errors on {{name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}" type = "query alert" From 54a90b3972a2a2a374f5a5726350f38ad2fdf52d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 15:34:57 +0100 Subject: [PATCH 13/93] MON-78 Removed upper case resource name --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 4e64044..68043f8 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,4 +1,4 @@ -resource "datadog_monitor" "SU_utilization" { +resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] SU utilization at more than ${var.su_utilization_critical}% on {{name}}" message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" From 9261bde1588268650f9f1295489daf756257ff8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 15:51:00 +0100 Subject: [PATCH 14/93] MON-78: Remove escalation variables, add message variable --- cloud/azure/stream-analytics/inputs.tf | 3 +-- 
cloud/azure/stream-analytics/monitors-stream-analytics.tf | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 529e669..d240169 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -1,5 +1,4 @@ -variable "critical_escalation_group" {} -variable "warning_escalation_group" {} +variable "message" {} variable "environment" {} diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 68043f8..6cf42c5 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,6 +1,6 @@ resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] SU utilization at more than ${var.su_utilization_critical}% on {{name}}" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > ${var.su_utilization_critical}" type = "query alert" @@ -23,7 +23,7 @@ resource "datadog_monitor" "su_utilization" { resource "datadog_monitor" "failed_function_requests" { name = "[${var.environment}] More than ${var.failed_function_requests_critical} failed function requests on {{name}}" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}" type = "query alert" @@ -46,7 +46,7 @@ resource "datadog_monitor" "failed_function_requests" { resource "datadog_monitor" 
"conversion_errors" { name = "[${var.environment}] More than ${var.conversion_errors_critical} conversion errors on {{name}}" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_critical}" type = "query alert" @@ -69,7 +69,7 @@ resource "datadog_monitor" "conversion_errors" { resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_critical} runtime errors on {{name}}" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}" type = "query alert" From 0b03cade41951578a3f6363b0733d31eee4e93e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 16:35:44 +0100 Subject: [PATCH 15/93] MON-78 Changing naming convention for variables --- cloud/azure/stream-analytics/inputs.tf | 16 +++++----- .../monitors-stream-analytics.tf | 32 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index d240169..2d0619a 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -14,34 +14,34 @@ variable "delay" { default = "600" } -variable "su_utilization_warning" { +variable "su_utilization_threshold_warning" { default = 60 } -variable "su_utilization_critical" { +variable "su_utilization_threshold_critical" { default = 80 } -variable "failed_function_requests_warning" { +variable "function_requests_threshold_warning" { default = 0 } -variable 
"failed_function_requests_critical" { +variable "function_requests_threshold_critical" { default = 10 } -variable "conversion_errors_warning" { +variable "conversion_errors_threshold_warning" { default = 0 } -variable "conversion_errors_critical" { +variable "conversion_errors_threshold_critical" { default = 10 } -variable "runtime_errors_warning" { +variable "runtime_errors_threshold_warning" { default = 0 } -variable "runtime_errors_critical" { +variable "runtime_errors_threshold_critical" { default = 10 } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 6cf42c5..55ac674 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,8 +1,8 @@ resource "datadog_monitor" "su_utilization" { - name = "[${var.environment}] SU utilization at more than ${var.su_utilization_critical}% on {{name}}" + name = "[${var.environment}] SU utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > ${var.su_utilization_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -16,16 +16,16 @@ resource "datadog_monitor" "su_utilization" { new_host_delay = "${var.delay}" no_data_timeframe = 20 thresholds { - warning = "${var.su_utilization_warning}" - critical = "${var.su_utilization_critical}" + warning = "${var.su_utilization_threshold_warning}" + critical = "${var.su_utilization_threshold_critical}" } } resource "datadog_monitor" "failed_function_requests" { - name = "[${var.environment}] More than ${var.failed_function_requests_critical} failed 
function requests on {{name}}" + name = "[${var.environment}] More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.function_requests_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -39,16 +39,16 @@ resource "datadog_monitor" "failed_function_requests" { new_host_delay = "${var.delay}" no_data_timeframe = 20 thresholds { - warning = "${var.failed_function_requests_warning}" - critical = "${var.failed_function_requests_critical}" + warning = "${var.function_requests_threshold_warning}" + critical = "${var.function_requests_threshold_critical}" } } resource "datadog_monitor" "conversion_errors" { - name = "[${var.environment}] More than ${var.conversion_errors_critical} conversion errors on {{name}}" + name = "[${var.environment}] More than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -62,16 +62,16 @@ resource "datadog_monitor" "conversion_errors" { new_host_delay = "${var.delay}" no_data_timeframe = 20 thresholds { - warning = "${var.conversion_errors_warning}" - critical = "${var.conversion_errors_critical}" + warning = "${var.conversion_errors_threshold_warning}" + critical = "${var.conversion_errors_threshold_critical}" } } resource "datadog_monitor" 
"runtime_errors" { - name = "[${var.environment}] More than ${var.runtime_errors_critical} runtime errors on {{name}}" + name = "[${var.environment}] More than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -85,7 +85,7 @@ resource "datadog_monitor" "runtime_errors" { new_host_delay = "${var.delay}" no_data_timeframe = 20 thresholds { - warning = "${var.runtime_errors_warning}" - critical = "${var.runtime_errors_critical}" + warning = "${var.runtime_errors_threshold_warning}" + critical = "${var.runtime_errors_threshold_critical}" } } From 0706a50badd6a4b442fe3afa6ab82712197572b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 17:01:21 +0100 Subject: [PATCH 16/93] MON-78: Changed monitor name for better clarity --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 55ac674..ed4c51f 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,5 +1,5 @@ resource "datadog_monitor" "su_utilization" { - name = "[${var.environment}] SU utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" + name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by 
{name,resource_group} > ${var.su_utilization_threshold_critical}" From 1f059622ed932ee209847dff647d30abc19ebdd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 17:46:21 +0100 Subject: [PATCH 17/93] MON-78 Changed filter to reach proper resources --- cloud/azure/stream-analytics/inputs.tf | 4 ++-- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 2d0619a..1c3ff2e 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -6,8 +6,8 @@ variable "notify_no_data" { default = "false" } -variable "filter_tags" { - default = "*" +variable "use_filter_tags" { + default = "true" } variable "delay" { diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index ed4c51f..6903b6a 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -2,7 +2,7 @@ resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"} by {name,resource_group} > ${var.su_utilization_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -25,7 +25,7 @@ resource "datadog_monitor" "failed_function_requests" { name = "[${var.environment}] More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.function_requests_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.function_requests_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -48,7 +48,7 @@ resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] More than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_stream_analytics:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -71,7 +71,7 @@ resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"} by {name,resource_group}} > ${var.runtime_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" From aaabb129b5ae66cc9b2e2f940bac0fc7e9f8ee91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 17:52:40 +0100 Subject: [PATCH 18/93] MON-78 Forgot a } --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 6903b6a..e95825e 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -2,7 +2,7 @@ resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"} by {name,resource_group} > ${var.su_utilization_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" From 686765bcaa83e795f9608aad0f39c681e589477c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 18:00:06 +0100 Subject: [PATCH 19/93] MON-78 Corrected typo in query for runtime_errors --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index e95825e..6ca7717 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -71,7 +71,7 @@ resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"} by {name,resource_group}} > ${var.runtime_errors_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" From f916fbfc81ffdfe273eafc6bcab98432faf1b0f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 12:00:56 +0100 Subject: [PATCH 20/93] MON-78: Readme --- cloud/azure/stream-analytics/README.md | 39 ++++++++++++++++++++++++++ cloud/azure/stream-analytics/inputs.tf | 24 +++++++++++----- 2 files changed, 56 insertions(+), 7 deletions(-) create mode 100644 cloud/azure/stream-analytics/README.md diff --git a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md new file mode 100644 index 0000000..83d0af4 --- /dev/null +++ b/cloud/azure/stream-analytics/README.md @@ -0,0 +1,39 @@ +Azure Stream Analytics DataDog monitors +======================================= + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-redis" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/stream-analytics?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" +} +``` + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| conversion_errors_threshold_critical | | string | `10` | no | +| conversion_errors_threshold_warning | | string | `0` | no | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| function_requests_threshold_critical | | string | `10` | no | +| function_requests_threshold_warning | | string | `0` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| notify_no_data | | string | `false` | no | +| runtime_errors_threshold_critical | | string | `10` | no | +| 
runtime_errors_threshold_warning | | string | `0` | no | +| su_utilization_threshold_critical | | string | `80` | no | +| su_utilization_threshold_warning | Monitor specific | string | `60` | no | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 1c3ff2e..29db469 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -1,19 +1,29 @@ -variable "message" {} +# Global Terraform +variable "environment" { + description = "Architecture environment" + type = "string" +} -variable "environment" {} +variable "message" { + description = "Message sent when a monitor is triggered" +} + +# Global DataDog +variable "use_filter_tags" { + description = "Filter the data with service tags if true" + default = "true" +} variable "notify_no_data" { default = "false" } -variable "use_filter_tags" { - default = "true" -} - variable "delay" { - default = "600" + description = "Delay in seconds for the metric evaluation" + default = 600 } +# Monitor specific variable "su_utilization_threshold_warning" { default = 60 } From 1a278fc81c90e853c0493132cd4f3e3f89858334 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 12:04:35 +0100 Subject: [PATCH 21/93] MON-78: Fixup use filter tag usage --- .../monitors-stream-analytics.tf | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 6ca7717..0972bd4 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,8 +1,16 @@ +data 
"template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_stream_analytics:enabled,env:%s", var.environment) : "*"}" + } +} + resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -25,7 +33,7 @@ resource "datadog_monitor" "failed_function_requests" { name = "[${var.environment}] More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.function_requests_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.function_requests_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -48,7 +56,7 @@ resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] More than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_stream_analytics:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -71,7 +79,7 @@ resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" From 41997c9afe58583177acb7915036c5cd8cbdd910 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 14:41:14 +0100 Subject: [PATCH 22/93] MON-78 Add EOF on querys --- .../monitors-stream-analytics.tf | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 0972bd4..8824410 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -10,7 +10,11 @@ resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" + query = < ${var.su_utilization_threshold_critical} + EOF type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -33,7 +37,11 @@ resource "datadog_monitor" "failed_function_requests" { name = "[${var.environment}] More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.function_requests_threshold_critical}" + query = < ${var.function_requests_threshold_critical} + EOF 
type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -56,7 +64,11 @@ resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] More than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" + query = < ${var.conversion_errors_threshold_critical} + EOF type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -79,7 +91,11 @@ resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" + query = < ${var.runtime_errors_threshold_critical} + EOF type = "query alert" notify_no_data = "${var.notify_no_data}" From c1563c331898b4ca8b2b08792e27d35e94affed2 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 14:25:24 +0100 Subject: [PATCH 23/93] MON-80 use only one message and add inputs descriptions --- cloud/azure/iothubs/README.md | 97 ++++++----------- cloud/azure/iothubs/inputs.tf | 76 +++++++------- cloud/azure/iothubs/monitors-iothubs.tf | 133 ++++++++++++++++++------ 3 files changed, 178 insertions(+), 128 deletions(-) diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index d53bf2b..3d6bb91 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -1,4 +1,4 @@ -Azure Redis DataDog monitors +Azure IOT Hubs DataDog monitors ============================ How to use this module @@ -8,22 +8,8 @@ How to use this module module "iothubs" { source = 
"git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors" - jobs_failed_message = "${module.datadog-message-alerting.alerting-message}" - listjobs_failed_message = "${module.datadog-message-alerting.alerting-message}" - queryjobs_failed_message = "${module.datadog-message-alerting.alerting-message}" - status_message = "${module.datadog-message-alerting.alerting-message}" - total_devices_message = "${module.datadog-message-alerting.alerting-message}" - c2d_methods_failed_message = "${module.datadog-message-alerting.alerting-message}" - c2d_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}" - c2d_twin_update_failed_message = "${module.datadog-message-alerting.alerting-message}" - d2c_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}" - d2c_twin_update_failed_message = "${module.datadog-message-alerting.alerting-message}" - d2c_telemetry_egress_dropped_message = "${module.datadog-message-alerting.alerting-message}" - d2c_telemetry_egress_orphaned_message = "${module.datadog-message-alerting.alerting-message}" - d2c_telemetry_egress_invalid_message = "${module.datadog-message-alerting.alerting-message}" - d2c_telemetry_egress_fallback_message = "${module.datadog-message-alerting.alerting-message}" - d2c_telemetry_ingress_nosent_message = "${module.datadog-message-alerting.alerting-message}" - + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" stack = "${var.stack}" client_name = "${var.client_name}" @@ -56,54 +42,39 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| c2d_methods_failed_message | | string | - | yes | -| c2d_methods_failed_threshold_critical | | string | `10` | no | -| c2d_methods_failed_threshold_warning | | string | `0` | no | -| c2d_twin_read_failed_message | | string | - | yes | -| c2d_twin_read_failed_threshold_critical 
| | string | `10` | no | -| c2d_twin_read_failed_threshold_warning | | string | `0` | no | -| c2d_twin_update_failed_message | | string | - | yes | -| c2d_twin_update_failed_threshold_critical | | string | `10` | no | -| c2d_twin_update_failed_threshold_warning | | string | `0` | no | -| client_name | | string | - | yes | -| d2c_telemetry_egress_dropped_message | | string | - | yes | -| d2c_telemetry_egress_dropped_threshold_critical | | string | `1000` | no | -| d2c_telemetry_egress_dropped_threshold_warning | | string | `500` | no | -| d2c_telemetry_egress_fallback_message | | string | - | yes | -| d2c_telemetry_egress_fallback_threshold_critical | | string | `1000` | no | -| d2c_telemetry_egress_fallback_threshold_warning | | string | `500` | no | -| d2c_telemetry_egress_invalid_message | | string | - | yes | -| d2c_telemetry_egress_invalid_threshold_critical | | string | `1000` | no | -| d2c_telemetry_egress_invalid_threshold_warning | | string | `500` | no | -| d2c_telemetry_egress_orphaned_message | | string | - | yes | -| d2c_telemetry_egress_orphaned_threshold_critical | | string | `1000` | no | -| d2c_telemetry_egress_orphaned_threshold_warning | | string | `500` | no | -| d2c_telemetry_ingress_nosent_message | | string | - | yes | -| d2c_twin_read_failed_message | | string | - | yes | -| d2c_twin_read_failed_threshold_critical | | string | `10` | no | -| d2c_twin_read_failed_threshold_warning | | string | `0` | no | -| d2c_twin_update_failed_message | | string | - | yes | -| d2c_twin_update_failed_threshold_critical | | string | `10` | no | -| d2c_twin_update_failed_threshold_warning | | string | `0` | no | -| delay | | string | `600` | no | -| environment | | string | - | yes | -| jobs_failed_message | | string | - | yes | -| jobs_failed_threshold_critical | | string | `10` | no | -| jobs_failed_threshold_warning | # IOT hubs | string | `0` | no | -| listjobs_failed_message | | string | - | yes | -| listjobs_failed_threshold_critical | | string | `10` | 
no | -| listjobs_failed_threshold_warning | | string | `0` | no | -| queryjobs_failed_message | | string | - | yes | -| queryjobs_failed_threshold_critical | | string | `10` | no | -| queryjobs_failed_threshold_warning | | string | `0` | no | -| stack | | string | - | yes | -| status_message | | string | - | yes | -| subscription_id | | string | - | yes | -| total_devices_message | | string | - | yes | +| c2d_methods_failed_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | +| c2d_methods_failed_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | +| c2d_twin_read_failed_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| c2d_twin_read_failed_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| c2d_twin_update_failed_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| c2d_twin_update_failed_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| client_name | Client Name | string | - | yes | +| d2c_telemetry_egress_dropped_threshold_critical | D2C Telemetry Dropped Failed limit (critical threshold) | string | `1000` | no | +| d2c_telemetry_egress_dropped_threshold_warning | D2C Telemetry Dropped Failed limit (warning threshold) | string | `500` | no | +| d2c_telemetry_egress_fallback_threshold_critical | D2C Telemetry Fallback Failed limit (critical threshold) | string | `1000` | no | +| d2c_telemetry_egress_fallback_threshold_warning | D2C Telemetry Fallback Failed limit (warning threshold) | string | `500` | no | +| d2c_telemetry_egress_invalid_threshold_critical | D2C Telemetry Invalid Failed limit (critical threshold) | string | `1000` | no | +| d2c_telemetry_egress_invalid_threshold_warning | D2C Telemetry Invalid Failed limit (warning threshold) | string | `500` | no | +| 
d2c_telemetry_egress_orphaned_threshold_critical | D2C Telemetry Orphaned Failed limit (critical threshold) | string | `1000` | no | +| d2c_telemetry_egress_orphaned_threshold_warning | D2C Telemetry Orphaned Failed limit (warning threshold) | string | `500` | no | +| d2c_twin_read_failed_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| d2c_twin_read_failed_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| d2c_twin_update_failed_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| d2c_twin_update_failed_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture Environment | string | - | yes | +| jobs_failed_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | +| jobs_failed_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | +| listjobs_failed_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | +| listjobs_failed_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | +| message | Message sent when an alert is triggered | string | - | yes | +| queryjobs_failed_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | +| queryjobs_failed_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| subscription_id | Subscription ID used to tag monitors | string | - | yes | Related documentation --------------------- -DataDog documentation: https://docs.datadoghq.com/integrations/azure_iot_hub/ +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_iot_hub](https://docs.datadoghq.com/integrations/azure_iot_hub) -Azure IOT Hubs metrics documentation: 
https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health \ No newline at end of file +Azure IOT Hubs metrics documentation: [https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health](https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health) \ No newline at end of file diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 5ae0587..cc591cd 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -1,138 +1,144 @@ -variable "environment" {} +variable "environment" { + description = "Architecture Environment" + type = "string" +} -variable "stack" {} +variable "client_name" { + description = "Client Name" + type = "string" +} -variable "client_name" {} - -variable "subscription_id" {} +variable "subscription_id" { + description = "Subscription ID used to tag monitors" + type = "string" +} variable "delay" { + description = "Delay in seconds for the metric evaluation" default = 600 } +variable "message" { + description = "Message sent when an alert is triggered" +} + ## IOT hubs variable "jobs_failed_threshold_warning" { + description = "Jobs Failed rate limit (warning threshold)" default = 0 } variable "jobs_failed_threshold_critical" { + description = "Jobs Failed rate limit (critical threshold)" default = 10 } -variable "jobs_failed_message" {} - variable "listjobs_failed_threshold_warning" { + description = "ListJobs Failed rate limit (warning threshold)" default = 0 } variable "listjobs_failed_threshold_critical" { + description = "ListJobs Failed rate limit (critical threshold)" default = 10 } -variable "listjobs_failed_message" {} - variable "queryjobs_failed_threshold_warning" { + description = "QueryJobs Failed rate limit (warning threshold)" default = 0 } variable "queryjobs_failed_threshold_critical" { + description = "QueryJobs Failed rate limit (critical threshold)" default = 10 } -variable "queryjobs_failed_message" {} - -variable 
"status_message" {} - -variable "total_devices_message" {} - variable "c2d_methods_failed_threshold_warning" { + description = "C2D Methods Failed rate limit (warning threshold)" default = 0 } variable "c2d_methods_failed_threshold_critical" { + description = "C2D Methods Failed rate limit (critical threshold)" default = 10 } -variable "c2d_methods_failed_message" {} - variable "c2d_twin_read_failed_threshold_warning" { + description = "C2D Twin Read Failed rate limit (warning threshold)" default = 0 } variable "c2d_twin_read_failed_threshold_critical" { + description = "C2D Twin Read Failed rate limit (critical threshold)" default = 10 } -variable "c2d_twin_read_failed_message" {} - variable "c2d_twin_update_failed_threshold_warning" { + description = "C2D Twin Update Failed rate limit (warning threshold)" default = 0 } variable "c2d_twin_update_failed_threshold_critical" { + description = "C2D Twin Update Failed rate limit (critical threshold)" default = 10 } -variable "c2d_twin_update_failed_message" {} - variable "d2c_twin_read_failed_threshold_warning" { + description = "D2C Twin Read Failed rate limit (warning threshold)" default = 0 } variable "d2c_twin_read_failed_threshold_critical" { + description = "D2C Twin Read Failed rate limit (critical threshold)" default = 10 } -variable "d2c_twin_read_failed_message" {} - variable "d2c_twin_update_failed_threshold_warning" { + description = "D2C Twin Update Failed rate limit (warning threshold)" default = 0 } variable "d2c_twin_update_failed_threshold_critical" { + description = "D2C Twin Update Failed rate limit (critical threshold)" default = 10 } -variable "d2c_twin_update_failed_message" {} - variable "d2c_telemetry_egress_dropped_threshold_warning" { + description = "D2C Telemetry Dropped Failed limit (warning threshold)" default = 500 } variable "d2c_telemetry_egress_dropped_threshold_critical" { + description = "D2C Telemetry Dropped Failed limit (critical threshold)" default = 1000 } -variable 
"d2c_telemetry_egress_dropped_message" {} - variable "d2c_telemetry_egress_orphaned_threshold_warning" { + description = "D2C Telemetry Orphaned Failed limit (warning threshold)" default = 500 } variable "d2c_telemetry_egress_orphaned_threshold_critical" { + description = "D2C Telemetry Orphaned Failed limit (critical threshold)" default = 1000 } -variable "d2c_telemetry_egress_orphaned_message" {} - variable "d2c_telemetry_egress_invalid_threshold_warning" { + description = "D2C Telemetry Invalid Failed limit (warning threshold)" default = 500 } variable "d2c_telemetry_egress_invalid_threshold_critical" { + description = "D2C Telemetry Invalid Failed limit (critical threshold)" default = 1000 } -variable "d2c_telemetry_egress_invalid_message" {} - variable "d2c_telemetry_egress_fallback_threshold_warning" { + description = "D2C Telemetry Fallback Failed limit (warning threshold)" default = 500 } variable "d2c_telemetry_egress_fallback_threshold_critical" { + description = "D2C Telemetry Fallback Failed limit (critical threshold)" default = 1000 } - -variable "d2c_telemetry_egress_fallback_message" {} - -variable "d2c_telemetry_ingress_nosent_message" {} diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 4c59099..f4a7073 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,8 +1,14 @@ resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" - message = "${var.jobs_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > 
${var.jobs_failed_threshold_critical}" + query = < ${var.jobs_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -24,9 +30,15 @@ resource "datadog_monitor" "too_many_jobs_failed" { resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}" - message = "${var.listjobs_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" + query = < ${var.listjobs_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -48,9 +60,15 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}" - message = "${var.queryjobs_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" + query = < ${var.queryjobs_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -72,9 +90,11 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { resource "datadog_monitor" "status" { name = "[${var.environment}] IOT Hub Status is not ok on {{name}}" - 
message = "${var.status_message}" + message = "${var.message}" - query = "avg(last_5m):avg:azure.devices_iothubs.status{subscription_id:${var.subscription_id}} by {name,resource_group} < 1" + query = < ${var.c2d_methods_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -134,9 +162,15 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { resource "datadog_monitor" "too_many_c2d_twin_read_failed" { name = "[${var.environment}] IOT Hub Too many c2d twin read failure on {{name}}" - message = "${var.c2d_twin_read_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" + query = < ${var.c2d_twin_read_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -158,9 +192,15 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { resource "datadog_monitor" "too_many_c2d_twin_update_failed" { name = "[${var.environment}] IOT Hub Too many c2d twin update failure on {{name}}" - message = "${var.c2d_twin_update_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" + query = < ${var.c2d_twin_update_failed_threshold_critical} + EOF 
type = "query alert" thresholds { @@ -182,9 +222,15 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { resource "datadog_monitor" "too_many_d2c_twin_read_failed" { name = "[${var.environment}] IOT Hub Too many d2c twin read failure on {{name}}" - message = "${var.d2c_twin_read_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" + query = < ${var.d2c_twin_read_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -206,9 +252,15 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { resource "datadog_monitor" "too_many_d2c_twin_update_failed" { name = "[${var.environment}] IOT Hub Too many d2c twin update failure on {{name}}" - message = "${var.d2c_twin_update_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" + query = < ${var.d2c_twin_update_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -230,9 +282,13 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { name = "[${var.environment}] IOT Hub Too many d2c telemetry 
egress dropped on {{name}}" - message = "${var.d2c_telemetry_egress_dropped_message}" + message = "${var.message}" - query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_dropped_threshold_critical}" + query = < ${var.d2c_telemetry_egress_dropped_threshold_critical} + EOF type = "query alert" thresholds { @@ -254,9 +310,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { name = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned on {{name}}" - message = "${var.d2c_telemetry_egress_orphaned_message}" + message = "${var.message}" - query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_orphaned_threshold_critical}" + query = < ${var.d2c_telemetry_egress_orphaned_threshold_critical} + EOF type = "query alert" thresholds { @@ -278,9 +338,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid on {{name}}" - message = "${var.d2c_telemetry_egress_invalid_message}" + message = "${var.message}" - query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_invalid_threshold_critical}" + query = < ${var.d2c_telemetry_egress_invalid_threshold_critical} + EOF type = "query alert" thresholds { @@ -302,9 +366,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { name = "[${var.environment}] IOT Hub Too many d2c telemetry egress fallback on {{name}}" - message = 
"${var.d2c_telemetry_egress_fallback_message}" + message = "${var.message}" - query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.fallback{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_fallback_threshold_critical}" + query = < ${var.d2c_telemetry_egress_fallback_threshold_critical} + EOF type = "query alert" thresholds { @@ -326,9 +394,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress no sent on {{name}}" - message = "${var.d2c_telemetry_ingress_nosent_message}" + message = "${var.message}" - query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() - avg:azure.devices_iothubs.d2c.telemetry.ingress.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() > 0" + query = < 0 + EOF type = "query alert" notify_no_data = false From 9186c6915042ad8f969e05e94ce5db7a5a6fc188 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 15:37:13 +0100 Subject: [PATCH 24/93] MON-80 Now support use_filter_tags --- cloud/azure/iothubs/inputs.tf | 5 +++++ cloud/azure/iothubs/monitors-iothubs.tf | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index cc591cd..d04d03b 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -8,6 +8,11 @@ variable "client_name" { type = "string" } +variable "use_filter_tags" { + description = "Filter the data with service tags if true" + default = "true" +} + variable "subscription_id" { description = "Subscription ID used to tag monitors" type = "string" diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index f4a7073..1ee29a3 100644 --- 
a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,3 +1,11 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_appservices:enabled,subscription_id:%s,env:%s", var.subscription_id,var.environment) : var.subscription_id}" + } +} + resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" message = "${var.message}" From 0b896d784b0db61fd975fe4876ca896e25c4c3ad Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 15:41:41 +0100 Subject: [PATCH 25/93] MON-78 Add Stream Analytics on several names to be more specific --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 8824410..6e6f651 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -34,7 +34,7 @@ resource "datadog_monitor" "su_utilization" { } resource "datadog_monitor" "failed_function_requests" { - name = "[${var.environment}] More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" + name = "[${var.environment}] Stream Analytics : More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" query = < Date: Fri, 3 Nov 2017 20:21:44 +0100 Subject: [PATCH 26/93] MON-78 add subscription_id and tags --- cloud/azure/stream-analytics/README.md | 3 +- cloud/azure/stream-analytics/inputs.tf | 33 +++++++++++++++---- .../monitors-stream-analytics.tf | 18 +++++++--- 3 files changed, 41 insertions(+), 13 deletions(-) diff --git a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md index 
83d0af4..f115e70 100644 --- a/cloud/azure/stream-analytics/README.md +++ b/cloud/azure/stream-analytics/README.md @@ -9,8 +9,8 @@ module "datadog-monitors-azure-redis" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/stream-analytics?ref={revision}" message = "${module.datadog-message-alerting.alerting-message}" - environment = "${var.environment}" + subscription_id = "${var.subscription_id}" } ``` @@ -31,6 +31,7 @@ Inputs | runtime_errors_threshold_warning | | string | `0` | no | | su_utilization_threshold_critical | | string | `80` | no | | su_utilization_threshold_warning | Monitor specific | string | `60` | no | +| subscription_id | Azure account id used as filter for monitors | string | - | yes | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 29db469..8160547 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -8,14 +8,28 @@ variable "message" { description = "Message sent when a monitor is triggered" } -# Global DataDog -variable "use_filter_tags" { - description = "Filter the data with service tags if true" - default = "true" +variable "subscription_id" { + description = "Azure account id used as filter for monitors" + type = "string" } -variable "notify_no_data" { - default = "false" +variable "provider" { + description = "Cloud provider which the monitor and its based metric depend on" + type = "string" + default = "azure" +} + +variable "service" { + description = "Service monitored by this set of monitors" + type = "string" + default = "storage" +} + +# Global DataDog + + +variable "message" { + description = "Message sent when a Redis monitor is triggered" } variable "delay" { @@ -23,7 +37,12 @@ variable "delay" { default = 600 } -# Monitor specific +variable "use_filter_tags" { + description = "Filter the 
data with service tags if true" + default = "true" +} + +# Azure Stream Analytics specific variable "su_utilization_threshold_warning" { default = 60 } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 6e6f651..e464dd4 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -2,12 +2,12 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_stream_analytics:enabled,env:%s", var.environment) : "*"}" + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}" } } resource "datadog_monitor" "su_utilization" { - name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" + name = "[${var.environment}] Stram Analytics streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" query = < Date: Fri, 3 Nov 2017 20:28:05 +0100 Subject: [PATCH 27/93] MON-78 add subscription_id and tags --- cloud/azure/stream-analytics/README.md | 9 ++++----- cloud/azure/stream-analytics/inputs.tf | 10 ++++++++-- .../stream-analytics/monitors-stream-analytics.tf | 11 +++++++---- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md index f115e70..28e3e2b 100644 --- a/cloud/azure/stream-analytics/README.md +++ b/cloud/azure/stream-analytics/README.md @@ -19,14 +19,13 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| conversion_errors_threshold_critical | | string | `10` | no | -| conversion_errors_threshold_warning | | string | `0` | no | +| 
conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no | +| conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| function_requests_threshold_critical | | string | `10` | no | -| function_requests_threshold_warning | | string | `0` | no | +| function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | +| function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | | message | Message sent when a monitor is triggered | string | - | yes | -| notify_no_data | | string | `false` | no | | runtime_errors_threshold_critical | | string | `10` | no | | runtime_errors_threshold_warning | | string | `0` | no | | su_utilization_threshold_critical | | string | `80` | no | diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 8160547..16807c8 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -26,8 +26,6 @@ variable "service" { } # Global DataDog - - variable "message" { description = "Message sent when a Redis monitor is triggered" } @@ -44,33 +42,41 @@ variable "use_filter_tags" { # Azure Stream Analytics specific variable "su_utilization_threshold_warning" { + description = "Streaming Unit utilization rate limit (warning threshold)" default = 60 } variable "su_utilization_threshold_critical" { + description = "Streaming Unit utilization rate limit (critical threshold)" default = 80 } variable "function_requests_threshold_warning" { + description = "Failed Function Request rate limit (warning threshold)" default = 0 } variable "function_requests_threshold_critical" { + description = "Failed Function Request rate limit (critical 
threshold)" default = 10 } variable "conversion_errors_threshold_warning" { + description = "Conversion errors limit (warning threshold)" default = 0 } variable "conversion_errors_threshold_critical" { + description = "Conversion errors limit (critical threshold)" default = 10 } variable "runtime_errors_threshold_warning" { + description = "Runtime errors limit (warning threshold)" default = 0 } variable "runtime_errors_threshold_critical" { + description = "Runtime errors limit (critical threshold)" default = 10 } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index e464dd4..0ecb513 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -41,8 +41,9 @@ resource "datadog_monitor" "failed_function_requests" { query = < ${var.function_requests_threshold_critical} + avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / + avg:azure.streamanalytics_streamingjobs.aml_callout_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + ) * 100 > ${var.function_requests_threshold_critical} EOF type = "query alert" @@ -66,7 +67,8 @@ resource "datadog_monitor" "failed_function_requests" { resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] Stream Analytics more than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" - message = "${var.message}" + # Hard Coded Message while we don't know how to configure warning and critical thresholds + message = "@FR-CloudPublic-run@fr.clara.net" query = < Date: Fri, 3 Nov 2017 20:35:35 +0100 Subject: [PATCH 28/93] MON-80 add tags --- cloud/azure/iothubs/README.md | 6 +- cloud/azure/iothubs/inputs.tf | 34 +++++---- cloud/azure/iothubs/monitors-iothubs.tf | 96 ++++++++++++++++--------- 3 files changed, 85 
insertions(+), 51 deletions(-) diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index 3d6bb91..a0e4be5 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -9,10 +9,7 @@ module "iothubs" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors" message = "${module.datadog-message-alerting.alerting-message}" - environment = "${var.environment}" - stack = "${var.stack}" - client_name = "${var.client_name}" subscription_id = "${var.subscription_id}" } ``` @@ -48,7 +45,6 @@ Inputs | c2d_twin_read_failed_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | | c2d_twin_update_failed_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | | c2d_twin_update_failed_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | -| client_name | Client Name | string | - | yes | | d2c_telemetry_egress_dropped_threshold_critical | D2C Telemetry Dropped Failed limit (critical threshold) | string | `1000` | no | | d2c_telemetry_egress_dropped_threshold_warning | D2C Telemetry Dropped Failed limit (warning threshold) | string | `500` | no | | d2c_telemetry_egress_fallback_threshold_critical | D2C Telemetry Fallback Failed limit (critical threshold) | string | `1000` | no | @@ -77,4 +73,4 @@ Related documentation DataDog documentation: [https://docs.datadoghq.com/integrations/azure_iot_hub](https://docs.datadoghq.com/integrations/azure_iot_hub) -Azure IOT Hubs metrics documentation: [https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health](https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health) \ No newline at end of file +Azure IOT Hubs metrics documentation: 
[https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health](https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index d04d03b..1efabc3 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -1,23 +1,26 @@ +# Global Terraform variable "environment" { description = "Architecture Environment" type = "string" } -variable "client_name" { - description = "Client Name" - type = "string" -} - -variable "use_filter_tags" { - description = "Filter the data with service tags if true" - default = "true" -} - variable "subscription_id" { - description = "Subscription ID used to tag monitors" - type = "string" + description = "Azure account id used as filter for monitors" + type = "string" } +variable "provider" { + description = "Cloud provider which the monitor and its based metric depend on" + type = "string" + default = "azure" +} + +variable "service" { + description = "Service monitored by this set of monitors" + type = "string" + default = "storage" + +# Global DataDog variable "delay" { description = "Delay in seconds for the metric evaluation" default = 600 @@ -27,7 +30,12 @@ variable "message" { description = "Message sent when an alert is triggered" } -## IOT hubs +variable "use_filter_tags" { + description = "Filter the data with service tags if true" + default = "true" +} + +# Azure IOT hubs specific variable "jobs_failed_threshold_warning" { description = "Jobs Failed rate limit (warning threshold)" default = 0 diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 1ee29a3..4398f5f 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_appservices:enabled,subscription_id:%s,env:%s", var.subscription_id,var.environment) : var.subscription_id}" + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "${data.template_file.filter.rendered}"}" } } @@ -12,9 +12,9 @@ resource "datadog_monitor" "too_many_jobs_failed" { query = < ${var.jobs_failed_threshold_critical} EOF type = "query alert" @@ -34,6 +34,8 @@ resource "datadog_monitor" "too_many_jobs_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_list_jobs_failed" { @@ -42,9 +44,9 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { query = < ${var.listjobs_failed_threshold_critical} EOF type = "query alert" @@ -64,6 +66,8 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_query_jobs_failed" { @@ -72,9 +76,9 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { query = < ${var.queryjobs_failed_threshold_critical} EOF type = "query alert" @@ -94,6 +98,8 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "status" { @@ -101,7 +107,7 @@ resource "datadog_monitor" "status" { message = "${var.message}" query = < ${var.c2d_methods_failed_threshold_critical} EOF type = "query alert" @@ -166,6 +176,8 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = 
["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_c2d_twin_read_failed" { @@ -174,9 +186,9 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { query = < ${var.c2d_twin_read_failed_threshold_critical} EOF type = "query alert" @@ -196,6 +208,8 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_c2d_twin_update_failed" { @@ -204,9 +218,9 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { query = < ${var.c2d_twin_update_failed_threshold_critical} EOF type = "query alert" @@ -226,6 +240,8 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_twin_read_failed" { @@ -234,9 +250,9 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { query = < ${var.d2c_twin_read_failed_threshold_critical} EOF type = "query alert" @@ -256,6 +272,8 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_twin_update_failed" { @@ -264,9 +282,9 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { query = < ${var.d2c_twin_update_failed_threshold_critical} EOF type = "query alert" @@ -286,6 +304,8 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } 
resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { @@ -294,7 +314,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { query = < ${var.d2c_telemetry_egress_dropped_threshold_critical} EOF type = "query alert" @@ -314,6 +334,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { @@ -322,7 +344,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { query = < ${var.d2c_telemetry_egress_orphaned_threshold_critical} EOF type = "query alert" @@ -342,6 +364,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { @@ -350,7 +374,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { query = < ${var.d2c_telemetry_egress_invalid_threshold_critical} EOF type = "query alert" @@ -370,6 +394,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { @@ -378,7 +404,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { query = < ${var.d2c_telemetry_egress_fallback_threshold_critical} EOF type = "query alert" @@ -398,6 +424,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = 
["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { @@ -406,8 +434,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { query = < 0 EOF type = "query alert" @@ -422,4 +450,6 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } From 2593e5fac4e9da58531e7195af53e5c62802c424 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 3 Nov 2017 20:47:29 +0100 Subject: [PATCH 29/93] MON-80 update readme --- cloud/azure/iothubs/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index a0e4be5..362e226 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -64,8 +64,10 @@ Inputs | listjobs_failed_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | | listjobs_failed_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | | message | Message sent when an alert is triggered | string | - | yes | +| provider | What is the monitored provider | string | azure | no | | queryjobs_failed_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | | queryjobs_failed_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| service | What is the monitored service | string | storage | no | | subscription_id | Subscription ID used to tag monitors | string | - | yes | Related documentation From 31f033c35d49109f39bc2e00337c78003fec721e Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 3 Nov 2017 20:51:18 +0100 Subject: [PATCH 30/93] MON-78 update readme --- cloud/azure/stream-analytics/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md index 28e3e2b..dca299b 100644 --- a/cloud/azure/stream-analytics/README.md +++ b/cloud/azure/stream-analytics/README.md @@ -26,10 +26,12 @@ Inputs | function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | | function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | | message | Message sent when a monitor is triggered | string | - | yes | +| provider | What is the monitored provider | string | azure | no | | runtime_errors_threshold_critical | | string | `10` | no | | runtime_errors_threshold_warning | | string | `0` | no | | su_utilization_threshold_critical | | string | `80` | no | | su_utilization_threshold_warning | Monitor specific | string | `60` | no | +| service | What is the monitored service | string | storage | no | | subscription_id | Azure account id used as filter for monitors | string | - | yes | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | From e0fa47008ae60aa0ba97eb6b2c33d40b0c2e596a Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 6 Nov 2017 10:30:00 +0100 Subject: [PATCH 31/93] MON-80 Update variables' names --- cloud/azure/iothubs/README.md | 57 ++++----- cloud/azure/iothubs/inputs.tf | 125 ++++++++++---------- cloud/azure/iothubs/monitors-iothubs.tf | 147 +++++++++++++----------- 3 files changed, 173 insertions(+), 156 deletions(-) diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index 362e226..339b357 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -1,5 +1,5 @@ Azure IOT Hubs DataDog monitors -============================ +=============================== How to use this module ---------------------- @@ -39,36 +39,37 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| 
c2d_methods_failed_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | -| c2d_methods_failed_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | -| c2d_twin_read_failed_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | -| c2d_twin_read_failed_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | -| c2d_twin_update_failed_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | -| c2d_twin_update_failed_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | -| d2c_telemetry_egress_dropped_threshold_critical | D2C Telemetry Dropped Failed limit (critical threshold) | string | `1000` | no | -| d2c_telemetry_egress_dropped_threshold_warning | D2C Telemetry Dropped Failed limit (warning threshold) | string | `500` | no | -| d2c_telemetry_egress_fallback_threshold_critical | D2C Telemetry Fallback Failed limit (critical threshold) | string | `1000` | no | -| d2c_telemetry_egress_fallback_threshold_warning | D2C Telemetry Fallback Failed limit (warning threshold) | string | `500` | no | -| d2c_telemetry_egress_invalid_threshold_critical | D2C Telemetry Invalid Failed limit (critical threshold) | string | `1000` | no | -| d2c_telemetry_egress_invalid_threshold_warning | D2C Telemetry Invalid Failed limit (warning threshold) | string | `500` | no | -| d2c_telemetry_egress_orphaned_threshold_critical | D2C Telemetry Orphaned Failed limit (critical threshold) | string | `1000` | no | -| d2c_telemetry_egress_orphaned_threshold_warning | D2C Telemetry Orphaned Failed limit (warning threshold) | string | `500` | no | -| d2c_twin_read_failed_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | -| d2c_twin_read_failed_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) 
| string | `0` | no | -| d2c_twin_update_failed_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | -| d2c_twin_update_failed_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | +| dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no | +| dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no | | environment | Architecture Environment | string | - | yes | -| jobs_failed_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | -| jobs_failed_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | -| listjobs_failed_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | -| listjobs_failed_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | +| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | +| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | +| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| 
failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | +| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | +| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | +| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | +| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | +| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | +| fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | +| invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | +| invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | | message | Message sent when an alert is triggered | string | - | yes | -| provider | What is the monitored provider | string | azure | no | -| queryjobs_failed_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | -| queryjobs_failed_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | -| service | What is the monitored service | string | storage | no | -| 
subscription_id | Subscription ID used to tag monitors | string | - | yes | +| orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no | +| orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no | +| provider | Cloud provider which the monitor and its based metric depend on | string | `azure` | no | +| service | Service monitored by this set of monitors | string | `storage` | no | +| subscription_id | Azure account id used as filter for monitors | string | - | yes | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation --------------------- diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 1efabc3..01c77fb 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -6,24 +6,25 @@ variable "environment" { variable "subscription_id" { description = "Azure account id used as filter for monitors" - type = "string" + type = "string" } variable "provider" { description = "Cloud provider which the monitor and its based metric depend on" - type = "string" - default = "azure" + type = "string" + default = "azure" } variable "service" { description = "Service monitored by this set of monitors" - type = "string" - default = "storage" + type = "string" + default = "storage" +} # Global DataDog variable "delay" { description = "Delay in seconds for the metric evaluation" - default = 600 + default = 600 } variable "message" { @@ -36,122 +37,122 @@ variable "use_filter_tags" { } # Azure IOT hubs specific -variable "jobs_failed_threshold_warning" { +variable "failed_jobs_rate_threshold_warning" { description = "Jobs Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "jobs_failed_threshold_critical" { +variable "failed_jobs_rate_threshold_critical" { description = "Jobs Failed rate limit (critical threshold)" - 
default = 10 + default = 10 } -variable "listjobs_failed_threshold_warning" { +variable "failed_listjobs_rate_threshold_warning" { description = "ListJobs Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "listjobs_failed_threshold_critical" { +variable "failed_listjobs_rate_threshold_critical" { description = "ListJobs Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "queryjobs_failed_threshold_warning" { +variable "failed_queryjobs_rate_threshold_warning" { description = "QueryJobs Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "queryjobs_failed_threshold_critical" { +variable "failed_queryjobs_rate_threshold_critical" { description = "QueryJobs Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "c2d_methods_failed_threshold_warning" { +variable "failed_c2d_methods_rate_threshold_warning" { description = "C2D Methods Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "c2d_methods_failed_threshold_critical" { +variable "failed_c2d_methods_rate_threshold_critical" { description = "C2D Methods Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "c2d_twin_read_failed_threshold_warning" { +variable "failed_c2d_twin_read_rate_threshold_warning" { description = "C2D Twin Read Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "c2d_twin_read_failed_threshold_critical" { +variable "failed_c2d_twin_read_rate_threshold_critical" { description = "C2D Twin Read Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "c2d_twin_update_failed_threshold_warning" { +variable "failed_c2d_twin_update_rate_threshold_warning" { description = "C2D Twin Update Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "c2d_twin_update_failed_threshold_critical" { +variable "failed_c2d_twin_update_rate_threshold_critical" { description 
= "C2D Twin Update Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "d2c_twin_read_failed_threshold_warning" { +variable "failed_d2c_twin_read_rate_threshold_warning" { description = "D2C Twin Read Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "d2c_twin_read_failed_threshold_critical" { +variable "failed_d2c_twin_read_rate_threshold_critical" { description = "D2C Twin Read Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "d2c_twin_update_failed_threshold_warning" { +variable "failed_d2c_twin_update_rate_threshold_warning" { description = "D2C Twin Update Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "d2c_twin_update_failed_threshold_critical" { +variable "failed_d2c_twin_update_rate_threshold_critical" { description = "D2C Twin Update Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "d2c_telemetry_egress_dropped_threshold_warning" { - description = "D2C Telemetry Dropped Failed limit (warning threshold)" - default = 500 +variable "dropped_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Dropped limit (warning threshold)" + default = 500 } -variable "d2c_telemetry_egress_dropped_threshold_critical" { - description = "D2C Telemetry Dropped Failed limit (critical threshold)" - default = 1000 +variable "dropped_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Dropped limit (critical threshold)" + default = 1000 } -variable "d2c_telemetry_egress_orphaned_threshold_warning" { - description = "D2C Telemetry Orphaned Failed limit (warning threshold)" - default = 500 +variable "orphaned_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Orphaned limit (warning threshold)" + default = 500 } -variable "d2c_telemetry_egress_orphaned_threshold_critical" { - description = "D2C Telemetry Orphaned Failed limit (critical threshold)" - default = 1000 
+variable "orphaned_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Orphaned limit (critical threshold)" + default = 1000 } -variable "d2c_telemetry_egress_invalid_threshold_warning" { - description = "D2C Telemetry Invalid Failed limit (warning threshold)" - default = 500 +variable "invalid_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Invalid limit (warning threshold)" + default = 500 } -variable "d2c_telemetry_egress_invalid_threshold_critical" { - description = "D2C Telemetry Invalid Failed limit (critical threshold)" - default = 1000 +variable "invalid_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Invalid limit (critical threshold)" + default = 1000 } -variable "d2c_telemetry_egress_fallback_threshold_warning" { - description = "D2C Telemetry Fallback Failed limit (warning threshold)" - default = 500 +variable "fallback_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Fallback limit (warning threshold)" + default = 500 } -variable "d2c_telemetry_egress_fallback_threshold_critical" { - description = "D2C Telemetry Fallback Failed limit (critical threshold)" - default = 1000 +variable "fallback_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Fallback limit (critical threshold)" + default = 1000 } diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 4398f5f..d7fb7e3 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -15,13 +15,14 @@ resource "datadog_monitor" "too_many_jobs_failed" { avg:azure.devices_iothubs.jobs.failed{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) - ) * 100 
> ${var.jobs_failed_threshold_critical} + ) * 100 > ${var.failed_jobs_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.jobs_failed_threshold_warning}" - critical = "${var.jobs_failed_threshold_critical}" + warning = "${var.failed_jobs_rate_threshold_warning}" + critical = "${var.failed_jobs_rate_threshold_critical}" } notify_no_data = false @@ -35,7 +36,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_list_jobs_failed" { @@ -47,13 +48,14 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { avg:azure.devices_iothubs.jobs.list_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() ) - ) * 100 > ${var.listjobs_failed_threshold_critical} + ) * 100 > ${var.failed_listjobs_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.listjobs_failed_threshold_warning}" - critical = "${var.listjobs_failed_threshold_critical}" + warning = "${var.failed_listjobs_rate_threshold_warning}" + critical = "${var.failed_listjobs_rate_threshold_critical}" } notify_no_data = false @@ -67,7 +69,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" 
"too_many_query_jobs_failed" { @@ -79,13 +81,14 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { avg:azure.devices_iothubs.jobs.query_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() ) - ) * 100 > ${var.queryjobs_failed_threshold_critical} + ) * 100 > ${var.failed_queryjobs_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.queryjobs_failed_threshold_warning}" - critical = "${var.queryjobs_failed_threshold_critical}" + warning = "${var.failed_queryjobs_rate_threshold_warning}" + critical = "${var.failed_queryjobs_rate_threshold_critical}" } notify_no_data = false @@ -99,7 +102,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "status" { @@ -109,7 +112,8 @@ resource "datadog_monitor" "status" { query = < ${var.c2d_methods_failed_threshold_critical} + ) * 100 > ${var.failed_c2d_methods_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.c2d_methods_failed_threshold_warning}" - critical = "${var.c2d_methods_failed_threshold_critical}" + warning = "${var.failed_c2d_methods_rate_threshold_warning}" + critical = "${var.failed_c2d_methods_rate_threshold_critical}" } notify_no_data = false @@ -177,7 +183,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = 
["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_c2d_twin_read_failed" { @@ -189,13 +195,14 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { avg:azure.devices_iothubs.c2d.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) - ) * 100 > ${var.c2d_twin_read_failed_threshold_critical} + ) * 100 > ${var.failed_c2d_twin_read_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.c2d_twin_read_failed_threshold_warning}" - critical = "${var.c2d_twin_read_failed_threshold_critical}" + warning = "${var.failed_c2d_twin_read_rate_threshold_warning}" + critical = "${var.failed_c2d_twin_read_rate_threshold_critical}" } notify_no_data = false @@ -209,7 +216,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_c2d_twin_update_failed" { @@ -221,13 +228,14 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { avg:azure.devices_iothubs.c2d.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) - ) * 
100 > ${var.c2d_twin_update_failed_threshold_critical} + ) * 100 > ${var.failed_c2d_twin_update_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.c2d_twin_update_failed_threshold_warning}" - critical = "${var.c2d_twin_update_failed_threshold_critical}" + warning = "${var.failed_c2d_twin_update_rate_threshold_warning}" + critical = "${var.failed_c2d_twin_update_rate_threshold_critical}" } notify_no_data = false @@ -241,7 +249,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_twin_read_failed" { @@ -253,13 +261,14 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { avg:azure.devices_iothubs.d2c.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) - ) * 100 > ${var.d2c_twin_read_failed_threshold_critical} + ) * 100 > ${var.failed_d2c_twin_read_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_twin_read_failed_threshold_warning}" - critical = "${var.d2c_twin_read_failed_threshold_critical}" + warning = "${var.failed_d2c_twin_read_rate_threshold_warning}" + critical = "${var.failed_d2c_twin_read_rate_threshold_critical}" } notify_no_data = false @@ -273,7 +282,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + 
tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_twin_update_failed" { @@ -285,13 +294,14 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { avg:azure.devices_iothubs.d2c.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) - ) * 100 > ${var.d2c_twin_update_failed_threshold_critical} + ) * 100 > ${var.failed_d2c_twin_update_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_twin_update_failed_threshold_warning}" - critical = "${var.d2c_twin_update_failed_threshold_critical}" + warning = "${var.failed_d2c_twin_update_rate_threshold_warning}" + critical = "${var.failed_d2c_twin_update_rate_threshold_critical}" } notify_no_data = false @@ -305,7 +315,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { @@ -315,13 +325,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { query = < ${var.d2c_telemetry_egress_dropped_threshold_critical} + ) > ${var.dropped_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_telemetry_egress_dropped_threshold_warning}" - critical = "${var.d2c_telemetry_egress_dropped_threshold_critical}" + warning = "${var.dropped_d2c_telemetry_egress_threshold_warning}" + critical = 
"${var.dropped_d2c_telemetry_egress_threshold_critical}" } notify_no_data = false @@ -335,7 +346,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { @@ -345,13 +356,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { query = < ${var.d2c_telemetry_egress_orphaned_threshold_critical} + ) > ${var.orphaned_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_telemetry_egress_orphaned_threshold_warning}" - critical = "${var.d2c_telemetry_egress_orphaned_threshold_critical}" + warning = "${var.orphaned_d2c_telemetry_egress_threshold_warning}" + critical = "${var.orphaned_d2c_telemetry_egress_threshold_critical}" } notify_no_data = false @@ -365,7 +377,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { @@ -375,13 +387,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { query = < ${var.d2c_telemetry_egress_invalid_threshold_critical} + ) > ${var.invalid_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_telemetry_egress_invalid_threshold_warning}" - critical = "${var.d2c_telemetry_egress_invalid_threshold_critical}" + warning = "${var.invalid_d2c_telemetry_egress_threshold_warning}" + critical = 
"${var.invalid_d2c_telemetry_egress_threshold_critical}" } notify_no_data = false @@ -395,7 +408,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { @@ -405,13 +418,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { query = < ${var.d2c_telemetry_egress_fallback_threshold_critical} + ) > ${var.fallback_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_telemetry_egress_fallback_threshold_warning}" - critical = "${var.d2c_telemetry_egress_fallback_threshold_critical}" + warning = "${var.fallback_d2c_telemetry_egress_threshold_warning}" + critical = "${var.fallback_d2c_telemetry_egress_threshold_critical}" } notify_no_data = false @@ -425,7 +439,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { @@ -438,7 +452,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { avg:azure.devices_iothubs.d2c.telemetry.ingress.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) > 0 EOF - type = "query alert" + + type = "query alert" notify_no_data = false evaluation_delay = "${var.delay}" @@ -451,5 +466,5 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = 
["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } From 279778ed888f891f8a30d033b004275757c904ff Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Thu, 23 Nov 2017 15:12:54 +0100 Subject: [PATCH 32/93] MON-80 Normalize monitors --- cloud/azure/iothubs/README.md | 11 ++--- cloud/azure/iothubs/inputs.tf | 26 +++-------- cloud/azure/iothubs/monitors-iothubs.tf | 62 ++++++++++++------------- 3 files changed, 42 insertions(+), 57 deletions(-) diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index 339b357..5187715 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -8,9 +8,8 @@ How to use this module module "iothubs" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors" - message = "${module.datadog-message-alerting.alerting-message}" - environment = "${var.environment}" - subscription_id = "${var.subscription_id}" + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" } ``` @@ -61,15 +60,13 @@ Inputs | failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | | fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | | fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | | invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning 
threshold) | string | `500` | no | | message | Message sent when an alert is triggered | string | - | yes | | orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no | | orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no | -| provider | Cloud provider which the monitor and its based metric depend on | string | `azure` | no | -| service | Service monitored by this set of monitors | string | `storage` | no | -| subscription_id | Azure account id used as filter for monitors | string | - | yes | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation --------------------- diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 01c77fb..1b1348f 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -4,23 +4,6 @@ variable "environment" { type = "string" } -variable "subscription_id" { - description = "Azure account id used as filter for monitors" - type = "string" -} - -variable "provider" { - description = "Cloud provider which the monitor and its based metric depend on" - type = "string" - default = "azure" -} - -variable "service" { - description = "Service monitored by this set of monitors" - type = "string" - default = "storage" -} - # Global DataDog variable "delay" { description = "Delay in seconds for the metric evaluation" @@ -31,11 +14,16 @@ variable "message" { description = "Message sent when an alert is triggered" } -variable "use_filter_tags" { - description = "Filter the data with service tags if true" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" default = "true" } +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + # Azure IOT hubs specific variable 
"failed_jobs_rate_threshold_warning" { description = "Jobs Failed rate limit (warning threshold)" diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index d7fb7e3..6e1f926 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "${data.template_file.filter.rendered}"}" + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_iothub:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } @@ -18,7 +18,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { ) * 100 > ${var.failed_jobs_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_jobs_rate_threshold_warning}" @@ -36,7 +36,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_list_jobs_failed" { @@ -51,7 +51,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { ) * 100 > ${var.failed_listjobs_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_listjobs_rate_threshold_warning}" @@ -69,7 +69,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_query_jobs_failed" { @@ -84,7 +84,7 @@ resource "datadog_monitor" 
"too_many_query_jobs_failed" { ) * 100 > ${var.failed_queryjobs_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_queryjobs_rate_threshold_warning}" @@ -102,7 +102,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "status" { @@ -113,7 +113,7 @@ resource "datadog_monitor" "status" { avg(last_5m):avg:azure.devices_iothubs.status{${data.template_file.filter.rendered}} by {name,resource_group} < 1 EOF - type = "query alert" + type = "metric alert" notify_no_data = true evaluation_delay = "${var.delay}" @@ -126,7 +126,7 @@ resource "datadog_monitor" "status" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "total_devices" { @@ -137,7 +137,7 @@ resource "datadog_monitor" "total_devices" { avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{${data.template_file.filter.rendered}} by {name,resource_group} == 0 EOF - type = "query alert" + type = "metric alert" notify_no_data = true evaluation_delay = "${var.delay}" @@ -150,7 +150,7 @@ resource "datadog_monitor" "total_devices" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_c2d_methods_failed" { @@ -165,7 +165,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { ) * 100 > ${var.failed_c2d_methods_rate_threshold_critical} EOF - type = "query alert" + 
type = "metric alert" thresholds { warning = "${var.failed_c2d_methods_rate_threshold_warning}" @@ -183,7 +183,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_c2d_twin_read_failed" { @@ -198,7 +198,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { ) * 100 > ${var.failed_c2d_twin_read_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_c2d_twin_read_rate_threshold_warning}" @@ -216,7 +216,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_c2d_twin_update_failed" { @@ -231,7 +231,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { ) * 100 > ${var.failed_c2d_twin_update_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_c2d_twin_update_rate_threshold_warning}" @@ -249,7 +249,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_twin_read_failed" { @@ -264,7 +264,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { ) * 100 > ${var.failed_d2c_twin_read_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = 
"${var.failed_d2c_twin_read_rate_threshold_warning}" @@ -282,7 +282,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_twin_update_failed" { @@ -297,7 +297,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { ) * 100 > ${var.failed_d2c_twin_update_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_d2c_twin_update_rate_threshold_warning}" @@ -315,7 +315,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { @@ -328,7 +328,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { ) > ${var.dropped_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.dropped_d2c_telemetry_egress_threshold_warning}" @@ -346,7 +346,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { @@ -359,7 +359,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { ) > ${var.orphaned_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = 
"${var.orphaned_d2c_telemetry_egress_threshold_warning}" @@ -377,7 +377,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { @@ -390,7 +390,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { ) > ${var.invalid_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.invalid_d2c_telemetry_egress_threshold_warning}" @@ -408,7 +408,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { @@ -421,7 +421,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { ) > ${var.fallback_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.fallback_d2c_telemetry_egress_threshold_warning}" @@ -439,7 +439,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { @@ -453,7 +453,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { ) > 0 EOF - type = "query alert" + type = "metric alert" notify_no_data = false evaluation_delay = "${var.delay}" @@ -466,5 
+466,5 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } From 8afae8b5f44cf60a04a4a6c22e6da5414553a129 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Thu, 23 Nov 2017 15:51:23 +0100 Subject: [PATCH 33/93] MON-78 Normalize monitors & add status monitor --- cloud/azure/stream-analytics/inputs.tf | 32 +++------ .../monitors-stream-analytics.tf | 69 ++++++++++++------- 2 files changed, 53 insertions(+), 48 deletions(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 16807c8..ae1186a 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -4,27 +4,6 @@ variable "environment" { type = "string" } -variable "message" { - description = "Message sent when a monitor is triggered" -} - -variable "subscription_id" { - description = "Azure account id used as filter for monitors" - type = "string" -} - -variable "provider" { - description = "Cloud provider which the monitor and its based metric depend on" - type = "string" - default = "azure" -} - -variable "service" { - description = "Service monitored by this set of monitors" - type = "string" - default = "storage" -} - # Global DataDog variable "message" { description = "Message sent when a Redis monitor is triggered" @@ -35,11 +14,16 @@ variable "delay" { default = 600 } -variable "use_filter_tags" { - description = "Filter the data with service tags if true" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" default = "true" } +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + # Azure Stream Analytics specific variable 
"su_utilization_threshold_warning" { description = "Streaming Unit utilization rate limit (warning threshold)" @@ -56,7 +40,7 @@ variable "function_requests_threshold_warning" { default = 0 } -variable "function_requests_threshold_critical" { +variable "failed_function_requests_threshold_critical" { description = "Failed Function Request rate limit (critical threshold)" default = 10 } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 0ecb513..f72af1f 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -2,12 +2,35 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}" + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_streamanalytics:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } +resource "datadog_monitor" "status" { + name = "[${var.environment}] Stream Analytics Status is not ok on {{name}}" + message = "${var.message}" + + query = < ${var.su_utilization_threshold_critical} EOF - type = "query alert" + type = "metric alert" - notify_no_data = "${var.notify_no_data}" + notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -32,22 +55,22 @@ resource "datadog_monitor" "su_utilization" { critical = "${var.su_utilization_threshold_critical}" } - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] } resource "datadog_monitor" "failed_function_requests" { - name = "[${var.environment}] Stream Analytics more than 
${var.function_requests_threshold_critical} failed function requests on {{name}}" + name = "[${var.environment}] Stream Analytics more than ${var.failed_function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" query = < ${var.function_requests_threshold_critical} + ) * 100 > ${var.failed_function_requests_threshold_critical} EOF - type = "query alert" + type = "metric alert" - notify_no_data = "${var.notify_no_data}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 60 notify_audit = false @@ -59,27 +82,26 @@ resource "datadog_monitor" "failed_function_requests" { no_data_timeframe = 20 thresholds { warning = "${var.function_requests_threshold_warning}" - critical = "${var.function_requests_threshold_critical}" + critical = "${var.failed_function_requests_threshold_critical}" } - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] } resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] Stream Analytics more than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" - # Hard Coded Message while we don't know how to configure warning and critical thresholds - message = "@FR-CloudPublic-run@fr.clara.net" + message = "${var.message}" query = < ${var.conversion_errors_threshold_critical} EOF - type = "query alert" + type = "metric alert" - notify_no_data = "${var.notify_no_data}" + notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -92,24 +114,23 @@ resource "datadog_monitor" "conversion_errors" { critical = "${var.conversion_errors_threshold_critical}" } - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", 
"provider:azure"] } resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] Stream Analytics more than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" - # Hard Coded Message while we don't know how to configure warning and critical thresholds - message = "@FR-CloudPublic-run@fr.clara.net" + message = "${var.message}" query = < ${var.runtime_errors_threshold_critical} EOF - type = "query alert" + type = "metric alert" - notify_no_data = "${var.notify_no_data}" + notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -122,5 +143,5 @@ resource "datadog_monitor" "runtime_errors" { critical = "${var.runtime_errors_threshold_critical}" } - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] } From d2e1aa5efddea62258790c9b0afff8dea0d51cf4 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Mon, 30 Oct 2017 11:40:16 +0100 Subject: [PATCH 34/93] MON-77 Azure Event Hub monitors --- cloud/azure/eventhub/inputs.tf | 31 ++++++++ cloud/azure/eventhub/monitors-eventhub.tf | 86 +++++++++++++++++++++++ cloud/azure/eventhub/outputs.tf | 11 +++ 3 files changed, 128 insertions(+) create mode 100644 cloud/azure/eventhub/inputs.tf create mode 100644 cloud/azure/eventhub/monitors-eventhub.tf create mode 100644 cloud/azure/eventhub/outputs.tf diff --git a/cloud/azure/eventhub/inputs.tf b/cloud/azure/eventhub/inputs.tf new file mode 100644 index 0000000..a1c7ec4 --- /dev/null +++ b/cloud/azure/eventhub/inputs.tf @@ -0,0 +1,31 @@ +variable "environment" {} + +variable "down_message" {} + +variable "failed_requests_message" {} + +variable "errors_message" {} + +variable "delay" { + default = 600 +} + +variable "failed_requests_rate_thresold_critical" { + default = 5 +} + +variable 
"failed_requests_rate_thresold_warning" { + default = 3 +} + +variable "errors_rate_thresold_critical" { + default = 5 +} + +variable "errors_rate_thresold_warning" { + default = 3 +} + +variable "use_filter_tags" { + default = "true" +} diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf new file mode 100644 index 0000000..7c22418 --- /dev/null +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -0,0 +1,86 @@ +resource "datadog_monitor" "eventhub_status" { + name = "[${var.environment}] Event Hub status" + message = "${var.down_message}" + + query = < ${var.failed_requests_rate_thresold_critical} + EOF + type = "query alert" + + thresholds { + critical = "${var.failed_requests_rate_thresold_critical}" + warning = "${var.failed_requests_rate_thresold_warning}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "eventhub_errors" { + name = "[${var.environment}] Event Hub errors" + message = "${var.errors_message}" + + query = < ${var.errors_rate_thresold_critical} + EOF + type = "query alert" + + thresholds { + critical = "${var.errors_rate_thresold_critical}" + warning = "${var.errors_rate_thresold_warning}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} diff --git a/cloud/azure/eventhub/outputs.tf b/cloud/azure/eventhub/outputs.tf new file mode 100644 index 0000000..b9d1822 --- /dev/null +++ b/cloud/azure/eventhub/outputs.tf @@ -0,0 +1,11 @@ +output "status_monitor_id" { + value = "${datadog_monitor.eventhub_failed_requests.id}" +} + +output "failed_requests_monitor_id" { + 
value = "${datadog_monitor.eventhub_status.id}" +} + +output "errors_monitor_id" { + value = "${datadog_monitor.eventhub_errors.id}" +} From 15549efc52e50e03a0b2d5165bdf41a121607947 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Mon, 30 Oct 2017 17:49:02 +0100 Subject: [PATCH 35/93] MON-77 Use data template for tag filter --- cloud/azure/eventhub/monitors-eventhub.tf | 31 +++++++++++++++-------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index 7c22418..71b97b3 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -1,9 +1,18 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}" + } +} + + resource "datadog_monitor" "eventhub_status" { name = "[${var.environment}] Event Hub status" message = "${var.down_message}" query = < ${var.failed_requests_rate_thresold_critical} EOF type = "query alert" @@ -56,14 +65,14 @@ resource "datadog_monitor" "eventhub_errors" { query = < ${var.errors_rate_thresold_critical} EOF type = "query alert" From 3330aeb9dcb6574deea57d671fcf5faa9cfa528e Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Mon, 30 Oct 2017 18:00:12 +0100 Subject: [PATCH 36/93] MON-77 Fix tag filters --- cloud/azure/eventhub/monitors-eventhub.tf | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index 71b97b3..2b67590 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -12,7 +12,7 @@ resource "datadog_monitor" "eventhub_status" { message = "${var.down_message}" query = < ${var.failed_requests_rate_thresold_critical} EOF type = "query alert" @@ -65,14 +65,14 @@ resource 
"datadog_monitor" "eventhub_errors" { query = < ${var.errors_rate_thresold_critical} EOF type = "query alert" From 1768c1621f7fbcce64464c5aa4f19bd217fae538 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Mon, 30 Oct 2017 19:05:40 +0100 Subject: [PATCH 37/93] MON-77 Change monitor type to to fix it --- cloud/azure/eventhub/monitors-eventhub.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index 2b67590..7600215 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -14,7 +14,7 @@ resource "datadog_monitor" "eventhub_status" { query = < Date: Tue, 31 Oct 2017 08:51:34 +0100 Subject: [PATCH 38/93] MON-77 Some documentation & lower thresold levels --- cloud/azure/eventhub/README.md | 53 +++++++++++++++++++++++ cloud/azure/eventhub/inputs.tf | 26 ++++++----- cloud/azure/eventhub/monitors-eventhub.tf | 12 ++--- 3 files changed, 74 insertions(+), 17 deletions(-) create mode 100644 cloud/azure/eventhub/README.md diff --git a/cloud/azure/eventhub/README.md b/cloud/azure/eventhub/README.md new file mode 100644 index 0000000..a148377 --- /dev/null +++ b/cloud/azure/eventhub/README.md @@ -0,0 +1,53 @@ +Event Hub Datadog monitor +========================= + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-eventhub" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/eventhub?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates a Datadog monitor with the following checks : + +* Service status check +* Failed request ratio +* Erroneous requests ratio + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| delay | Delay in seconds for the metric evaluation | string | `600` | 
no | +| environment | Architecture environment | string | - | yes | +| errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no | +| errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no | +| failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no | +| failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no | +| message | Message sent when an alert is triggered | string | - | yes | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | + +Outputs +------- + +| Name | Description | +|------|-------------| +| errors_monitor_id | Id of the `errors` monitor | +| failed_requests_monitor_id | Id of the `failed requests` monitor | +| status_monitor_id | Id of the `status` monitor | + +Related documentation +--------------------- + +Datadog documentation : [https://docs.datadoghq.com/integrations/azure_event_hub/](https://docs.datadoghq.com/integrations/azure_event_hub/) + +Azure metrics documentation : [https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-metrics-azure-monitor](https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-metrics-azure-monitor) diff --git a/cloud/azure/eventhub/inputs.tf b/cloud/azure/eventhub/inputs.tf index a1c7ec4..a67caae 100644 --- a/cloud/azure/eventhub/inputs.tf +++ b/cloud/azure/eventhub/inputs.tf @@ -1,31 +1,35 @@ variable "environment" {} -variable "down_message" {} - -variable "failed_requests_message" {} - -variable "errors_message" {} +variable "message" { + description = "Message sent when an alert is triggered" +} variable "delay" { + description = "Delay in seconds for the metric evaluation" default = 600 } variable "failed_requests_rate_thresold_critical" { - default = 5 + description = "Failed requests ratio (percentage) to trigger the critical 
alert" + default = 3 } variable "failed_requests_rate_thresold_warning" { - default = 3 + description = "Failed requests ratio (percentage) to trigger a warning alert" + default = 1 } variable "errors_rate_thresold_critical" { - default = 5 -} - -variable "errors_rate_thresold_warning" { + description = "Errors ratio (percentage) to trigger the critical alert" default = 3 } +variable "errors_rate_thresold_warning" { + description = "Errors ratio (percentage) to trigger a warning alert" + default = 1 +} + variable "use_filter_tags" { + description = "Filter the data with service tags if true" default = "true" } diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index 7600215..efe1351 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -9,7 +9,7 @@ data "template_file" "filter" { resource "datadog_monitor" "eventhub_status" { name = "[${var.environment}] Event Hub status" - message = "${var.down_message}" + message = "${var.message}" query = < Date: Fri, 3 Nov 2017 20:41:57 +0100 Subject: [PATCH 39/93] MON-77 add tags and subscription_id --- cloud/azure/eventhub/README.md | 5 +++- cloud/azure/eventhub/inputs.tf | 34 +++++++++++++++++++---- cloud/azure/eventhub/monitors-eventhub.tf | 11 ++++++-- 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/cloud/azure/eventhub/README.md b/cloud/azure/eventhub/README.md index a148377..6e40955 100644 --- a/cloud/azure/eventhub/README.md +++ b/cloud/azure/eventhub/README.md @@ -9,8 +9,8 @@ module "datadog-monitors-azure-eventhub" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/eventhub?ref={revision}" message = "${module.datadog-message-alerting.alerting-message}" - environment = "${var.environment}" + subscription_id = "${var.subscription_id}" } ``` @@ -29,12 +29,15 @@ Inputs |------|-------------|:----:|:-----:|:-----:| | delay | Delay in seconds for the metric evaluation | 
string | `600` | no | | environment | Architecture environment | string | - | yes | +| provider | What is the monitored provider | string | - | yes | +| service | What is the monitored service | string | - | yes | | errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no | | errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no | | failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no | | failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no | | message | Message sent when an alert is triggered | string | - | yes | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | +| subscription_id | Azure account id used as filter for monitors | string | - | yes | Outputs ------- diff --git a/cloud/azure/eventhub/inputs.tf b/cloud/azure/eventhub/inputs.tf index a67caae..d520dc2 100644 --- a/cloud/azure/eventhub/inputs.tf +++ b/cloud/azure/eventhub/inputs.tf @@ -1,5 +1,27 @@ -variable "environment" {} +# Global Terraform +variable "environment" { + description = "Architecture environment" + type = "string" +} +variable "subscription_id" { + description = "Azure account id used as filter for monitors" + type = "string" +} + +variable "provider" { + description = "Cloud provider which the monitor and its based metric depend on" + type = "string" + default = "azure" +} + +variable "service" { + description = "Service monitored by this set of monitors" + type = "string" + default = "storage" +} + +# Global DataDog variable "message" { description = "Message sent when an alert is triggered" } @@ -9,6 +31,11 @@ variable "delay" { default = 600 } +variable "use_filter_tags" { + description = "Filter the data with service tags if true" + default = "true" +} + variable "failed_requests_rate_thresold_critical" { 
description = "Failed requests ratio (percentage) to trigger the critical alert" default = 3 @@ -28,8 +55,3 @@ variable "errors_rate_thresold_warning" { description = "Errors ratio (percentage) to trigger a warning alert" default = 1 } - -variable "use_filter_tags" { - description = "Filter the data with service tags if true" - default = "true" -} diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index efe1351..89a3d8a 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -2,11 +2,10 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}" + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}" } } - resource "datadog_monitor" "eventhub_status" { name = "[${var.environment}] Event Hub status" message = "${var.message}" @@ -26,6 +25,8 @@ resource "datadog_monitor" "eventhub_status" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "eventhub_failed_requests" { @@ -57,6 +58,8 @@ resource "datadog_monitor" "eventhub_failed_requests" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "eventhub_errors" { @@ -91,5 +94,7 @@ resource "datadog_monitor" "eventhub_errors" { locked = false require_full_window = true new_host_delay = "${var.delay}" - no_data_timeframe = 20 + no_data_timeframe = 20o + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } From 205f3e963596dee548183e5a34ec081ab5e6df08 
Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 3 Nov 2017 20:48:42 +0100 Subject: [PATCH 40/93] MON-77 update readme --- cloud/azure/eventhub/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloud/azure/eventhub/README.md b/cloud/azure/eventhub/README.md index 6e40955..f4db2d6 100644 --- a/cloud/azure/eventhub/README.md +++ b/cloud/azure/eventhub/README.md @@ -29,15 +29,15 @@ Inputs |------|-------------|:----:|:-----:|:-----:| | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| provider | What is the monitored provider | string | - | yes | -| service | What is the monitored service | string | - | yes | | errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no | | errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no | | failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no | | failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no | | message | Message sent when an alert is triggered | string | - | yes | +| provider | What is the monitored provider | string | azure | no | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | | subscription_id | Azure account id used as filter for monitors | string | - | yes | +| service | What is the monitored service | string | storage | no | Outputs ------- From 5df915df51e3f0d17badc0c38b9e6e76770e80fe Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Thu, 23 Nov 2017 16:36:18 +0100 Subject: [PATCH 41/93] MON-77 Fix unattended char --- cloud/azure/eventhub/monitors-eventhub.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf 
index 89a3d8a..733e141 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -94,7 +94,7 @@ resource "datadog_monitor" "eventhub_errors" { locked = false require_full_window = true new_host_delay = "${var.delay}" - no_data_timeframe = 20o + no_data_timeframe = 20 tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } From 6c10a32ff3303db46f8da3b746a9f1df3a0b35ae Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Thu, 23 Nov 2017 16:50:04 +0100 Subject: [PATCH 42/93] MON-77 Normalize monitors --- cloud/azure/eventhub/README.md | 6 ++---- cloud/azure/eventhub/inputs.tf | 26 ++++++----------------- cloud/azure/eventhub/monitors-eventhub.tf | 18 ++++++++-------- 3 files changed, 18 insertions(+), 32 deletions(-) diff --git a/cloud/azure/eventhub/README.md b/cloud/azure/eventhub/README.md index f4db2d6..b2573da 100644 --- a/cloud/azure/eventhub/README.md +++ b/cloud/azure/eventhub/README.md @@ -33,11 +33,9 @@ Inputs | errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no | | failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no | | failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when an alert is triggered | string | - | yes | -| provider | What is the monitored provider | string | azure | no | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | -| subscription_id | Azure account id used as filter for monitors | string | - | yes | -| service | What is the monitored service | string | storage | no | Outputs ------- diff --git 
a/cloud/azure/eventhub/inputs.tf b/cloud/azure/eventhub/inputs.tf index d520dc2..b41fdf5 100644 --- a/cloud/azure/eventhub/inputs.tf +++ b/cloud/azure/eventhub/inputs.tf @@ -4,23 +4,6 @@ variable "environment" { type = "string" } -variable "subscription_id" { - description = "Azure account id used as filter for monitors" - type = "string" -} - -variable "provider" { - description = "Cloud provider which the monitor and its based metric depend on" - type = "string" - default = "azure" -} - -variable "service" { - description = "Service monitored by this set of monitors" - type = "string" - default = "storage" -} - # Global DataDog variable "message" { description = "Message sent when an alert is triggered" @@ -31,11 +14,16 @@ variable "delay" { default = 600 } -variable "use_filter_tags" { - description = "Filter the data with service tags if true" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" default = "true" } +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + variable "failed_requests_rate_thresold_critical" { description = "Failed requests ratio (percentage) to trigger the critical alert" default = 3 diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index 733e141..ff52507 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -2,12 +2,12 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}" + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } resource "datadog_monitor" "eventhub_status" { - name = "[${var.environment}] Event Hub status" + name = "[${var.environment}] Event Hub status is not ok on {{name}}" message = "${var.message}" query = < ${var.failed_requests_rate_thresold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { critical = "${var.failed_requests_rate_thresold_critical}" @@ -59,11 +59,11 @@ resource "datadog_monitor" "eventhub_failed_requests" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:eventhub", "team:azure", "provider:azure"] } resource "datadog_monitor" "eventhub_errors" { - name = "[${var.environment}] Event Hub errors" + name = "[${var.environment}] Event Hub too much errors on {{name}}" message = "${var.message}" query = < ${var.errors_rate_thresold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { critical = "${var.errors_rate_thresold_critical}" @@ -96,5 +96,5 @@ resource "datadog_monitor" "eventhub_errors" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:eventhub", "team:azure", "provider:azure"] } From 6e6147088cbb58c322031c3f2169001025a0cae3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Mon, 30 Oct 2017 11:34:42 +0100 Subject: [PATCH 43/93] MON-76: Azure Redis - DataDog Monitors --- cloud/azure/redis/inputs.tf | 31 +++++++++++++++++ cloud/azure/redis/monitors-azure-redis.tf | 42 +++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 cloud/azure/redis/inputs.tf create mode 100644 cloud/azure/redis/monitors-azure-redis.tf diff --git a/cloud/azure/redis/inputs.tf 
b/cloud/azure/redis/inputs.tf new file mode 100644 index 0000000..70eba23 --- /dev/null +++ b/cloud/azure/redis/inputs.tf @@ -0,0 +1,31 @@ +# Global Terraform +variable "client_name" { + type = "string" +} + +variable "environment" { + type = "string" +} + +variable "stack" { + type = "string" +} + +# Global DataDog +variable "critical_escalation_group" { +} + +variable "warning_escalation_group" { +} + +variable "delay" { + default = 600 +} + +# Azure Redis specific +variable "evictedkeys_threshold_warning" { + default = 0 +} +variable "evictedkeys_threshold_critical" { + default = 100 +} diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf new file mode 100644 index 0000000..ec562b9 --- /dev/null +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -0,0 +1,42 @@ +resource "datadog_monitor" "status" { + name = "[${var.environment}] Redis {{name}} is down" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.cache_redis.status{*} by {name,resource_group} != 1" + type = "query alert" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "evictedkeys" { + name = "[${var.environment}] Redis {{value}} evictedkeys on {{name}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.cache_redis.evictedkeys{*} by {name,resource_group} > ${var.evictedkeys_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.evictedkeys_threshold_warning}" + critical = "${var.evictedkeys_threshold_critical}" + } + + notify_no_data = 
false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} From 9112ce02a390dc783a85ee6a92b65239b6f35d6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Mon, 30 Oct 2017 16:37:03 +0100 Subject: [PATCH 44/93] MON-76: Uses the generic message --- cloud/azure/redis/inputs.tf | 5 +---- cloud/azure/redis/monitors-azure-redis.tf | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf index 70eba23..3f9460f 100644 --- a/cloud/azure/redis/inputs.tf +++ b/cloud/azure/redis/inputs.tf @@ -12,10 +12,7 @@ variable "stack" { } # Global DataDog -variable "critical_escalation_group" { -} - -variable "warning_escalation_group" { +variable "message" { } variable "delay" { diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index ec562b9..8b47249 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -1,6 +1,6 @@ resource "datadog_monitor" "status" { name = "[${var.environment}] Redis {{name}} is down" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.cache_redis.status{*} by {name,resource_group} != 1" type = "query alert" @@ -19,7 +19,7 @@ resource "datadog_monitor" "status" { resource "datadog_monitor" "evictedkeys" { name = "[${var.environment}] Redis {{value}} evictedkeys on {{name}}" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.cache_redis.evictedkeys{*} by {name,resource_group} > 
${var.evictedkeys_threshold_critical}" type = "query alert" From 9f1051097e4b42b37f3814a7cb6d139f537ba280 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Mon, 30 Oct 2017 17:44:30 +0100 Subject: [PATCH 45/93] MON-76: More monitors --- cloud/azure/redis/inputs.tf | 20 +++++--- cloud/azure/redis/monitors-azure-redis.tf | 58 +++++++++++++++++++++-- 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf index 3f9460f..f13b4cb 100644 --- a/cloud/azure/redis/inputs.tf +++ b/cloud/azure/redis/inputs.tf @@ -7,10 +7,6 @@ variable "environment" { type = "string" } -variable "stack" { - type = "string" -} - # Global DataDog variable "message" { } @@ -20,9 +16,21 @@ variable "delay" { } # Azure Redis specific -variable "evictedkeys_threshold_warning" { +variable "evictedkeys_limit_threshold_warning" { default = 0 } -variable "evictedkeys_threshold_critical" { +variable "evictedkeys_limit_threshold_critical" { default = 100 } +variable "percent_processor_time_threshold_critical" { + default = 80 +} +variable "percent_processor_time_threshold_warning" { + default = 60 +} +variable "server_load_rate_threshold_critical" { + default = 90 +} +variable "server_load_rate_threshold_warning" { + default = 70 +} diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index 8b47249..d4b21b5 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -7,7 +7,7 @@ resource "datadog_monitor" "status" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -21,17 +21,65 @@ resource "datadog_monitor" "evictedkeys" { name = "[${var.environment}] Redis {{value}} evictedkeys on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.evictedkeys{*} by {name,resource_group} > 
${var.evictedkeys_threshold_critical}" + query = "avg(last_5m):avg:azure.cache_redis.evictedkeys{*} by {name,resource_group} > ${var.evictedkeys_limit_threshold_critical}" type = "query alert" thresholds { - warning = "${var.evictedkeys_threshold_warning}" - critical = "${var.evictedkeys_threshold_critical}" + warning = "${var.evictedkeys_limit_threshold_warning}" + critical = "${var.evictedkeys_limit_threshold_critical}" } notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "percent_processor_time" { + name = "[${var.environment}] Redis processor time {{value}}% on {{name}}" + message = "${var.message}" + + query = "avg(last_5m):avg:azure.cache_redis.percent_processor_time{*} by {name,resource_group} > ${var.percent_processor_time_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.percent_processor_time_threshold_warning}" + critical = "${var.percent_processor_time_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "server_load" { + name = "[${var.environment}] Redis processor server load {{value}}% on {{name}}" + message = "${var.message}" + + query = "avg(last_5m):avg:azure.cache_redis.server_load{*} by {name,resource_group} > ${var.server_load_rate_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.server_load_rate_threshold_critical}" + critical = "${var.server_load_rate_threshold_warning}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 notify_audit = false 
timeout_h = 0 include_tags = true From 386ad343a54753b73956e831c28cce39f14088ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 10:34:48 +0100 Subject: [PATCH 46/93] MON-76: Filter tags option --- cloud/azure/redis/inputs.tf | 12 ++++++++++++ cloud/azure/redis/monitors-azure-redis.tf | 16 ++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf index f13b4cb..a96cc51 100644 --- a/cloud/azure/redis/inputs.tf +++ b/cloud/azure/redis/inputs.tf @@ -9,28 +9,40 @@ variable "environment" { # Global DataDog variable "message" { + description = "Message sent when a Redis monitor is triggered" } variable "delay" { + description = "Delay in seconds for the metric evaluation" default = 600 } +variable "use_filter_tags" { + description = "Filter the data with service tags if true" + default = "true" +} + # Azure Redis specific variable "evictedkeys_limit_threshold_warning" { default = 0 } + variable "evictedkeys_limit_threshold_critical" { default = 100 } + variable "percent_processor_time_threshold_critical" { default = 80 } + variable "percent_processor_time_threshold_warning" { default = 60 } + variable "server_load_rate_threshold_critical" { default = 90 } + variable "server_load_rate_threshold_warning" { default = 70 } diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index d4b21b5..6931afe 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -1,8 +1,16 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}" + } +} + resource "datadog_monitor" "status" { name = "[${var.environment}] Redis {{name}} is down" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.status{*} by {name,resource_group} != 1" + query = "avg(last_5m):avg:azure.cache_redis.status{${data.template_file.filter.rendered}} by {name,resource_group} != 1" type = "query alert" notify_no_data = false @@ -21,7 +29,7 @@ resource "datadog_monitor" "evictedkeys" { name = "[${var.environment}] Redis {{value}} evictedkeys on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.evictedkeys{*} by {name,resource_group} > ${var.evictedkeys_limit_threshold_critical}" + query = "avg(last_5m):avg:azure.cache_redis.evictedkeys{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.evictedkeys_limit_threshold_critical}" type = "query alert" thresholds { @@ -45,7 +53,7 @@ resource "datadog_monitor" "percent_processor_time" { name = "[${var.environment}] Redis processor time {{value}}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.percent_processor_time{*} by {name,resource_group} > ${var.percent_processor_time_threshold_critical}" + query = "avg(last_5m):avg:azure.cache_redis.percent_processor_time{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.percent_processor_time_threshold_critical}" type = "query alert" thresholds { @@ -69,7 +77,7 @@ resource "datadog_monitor" "server_load" { name = "[${var.environment}] Redis processor server load {{value}}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.server_load{*} by {name,resource_group} > ${var.server_load_rate_threshold_critical}" + query = "avg(last_5m):avg:azure.cache_redis.server_load{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.server_load_rate_threshold_critical}" type = "query alert" 
thresholds { From 8aab6d99b025ff3bf4375a8cc7310ac65edfe749 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 10:38:20 +0100 Subject: [PATCH 47/93] MON-76: Multiple line queries for better readibility --- cloud/azure/redis/monitors-azure-redis.tf | 26 +++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index 6931afe..b3ad63a 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -10,7 +10,9 @@ resource "datadog_monitor" "status" { name = "[${var.environment}] Redis {{name}} is down" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.status{${data.template_file.filter.rendered}} by {name,resource_group} != 1" + query = < ${var.evictedkeys_limit_threshold_critical} +EOF type = "query alert" thresholds { @@ -53,7 +59,11 @@ resource "datadog_monitor" "percent_processor_time" { name = "[${var.environment}] Redis processor time {{value}}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.percent_processor_time{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.percent_processor_time_threshold_critical}" + query = < ${var.percent_processor_time_threshold_critical} +EOF type = "query alert" thresholds { @@ -77,12 +87,16 @@ resource "datadog_monitor" "server_load" { name = "[${var.environment}] Redis processor server load {{value}}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.cache_redis.server_load{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.server_load_rate_threshold_critical}" + query = < ${var.server_load_rate_threshold_critical} +EOF type = "query alert" thresholds { - warning = "${var.server_load_rate_threshold_critical}" - critical = "${var.server_load_rate_threshold_warning}" + warning = 
"${var.server_load_rate_threshold_warning}" + critical = "${var.server_load_rate_threshold_critical}" } notify_no_data = false From 505e0df14c86116c1c30a91d64fbc6f352bc2a7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 10:53:58 +0100 Subject: [PATCH 48/93] MON-76: Add Readme --- cloud/azure/redis/README.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 cloud/azure/redis/README.md diff --git a/cloud/azure/redis/README.md b/cloud/azure/redis/README.md new file mode 100644 index 0000000..b5acaaa --- /dev/null +++ b/cloud/azure/redis/README.md @@ -0,0 +1,32 @@ +Azure Redis DataDog monitors +============================ + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-redis" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/redis?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" + client_name = "${var.client_name}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* Service status check +* Evicted keys count check +* Processor time (percent) threshold +* Server CPU load threshold + +Related documentation +--------------------- + +DataDog documentation: https://docs.datadoghq.com/integrations/azure_redis_cache/ + +Azure Redis metrics documentation: https://docs.microsoft.com/en-us/azure/redis-cache/cache-how-to-monitor From 814ee2838da545e1fd75592038e4f7e7fe2dd4f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:04:38 +0100 Subject: [PATCH 49/93] MON-76: Readme update with inputs --- cloud/azure/redis/README.md | 17 +++++++++++++++ cloud/azure/redis/inputs.tf | 26 +++++++++++++++-------- cloud/azure/redis/monitors-azure-redis.tf | 12 +++++++---- 3 files changed, 42 insertions(+), 13 deletions(-) diff --git a/cloud/azure/redis/README.md 
b/cloud/azure/redis/README.md index b5acaaa..d885193 100644 --- a/cloud/azure/redis/README.md +++ b/cloud/azure/redis/README.md @@ -24,6 +24,23 @@ Creates a DataDog monitors with the following checks : * Processor time (percent) threshold * Server CPU load threshold +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| client_name | Client name | string | - | yes | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no | +| evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no | +| message | Message sent when a Redis monitor is triggered | string | - | yes | +| percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no | +| percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no | +| server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | +| server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | + Related documentation --------------------- diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf index a96cc51..89385e8 100644 --- a/cloud/azure/redis/inputs.tf +++ b/cloud/azure/redis/inputs.tf @@ -1,10 +1,12 @@ # Global Terraform variable "client_name" { - type = "string" + description = "Client name" + type = "string" } variable "environment" { - type = "string" + description = "Architecture environment" + type = "string" } # Global DataDog @@ -14,7 +16,7 @@ variable "message" { variable "delay" { description = "Delay in seconds for the metric evaluation" - default = 600 + default 
= 600 } variable "use_filter_tags" { @@ -24,25 +26,31 @@ variable "use_filter_tags" { # Azure Redis specific variable "evictedkeys_limit_threshold_warning" { - default = 0 + description = "Evicted keys limit (warning threshold)" + default = 0 } variable "evictedkeys_limit_threshold_critical" { - default = 100 + description = "Evicted keys limit (critical threshold)" + default = 100 } variable "percent_processor_time_threshold_critical" { - default = 80 + description = "Processor time percent (critical threshold)" + default = 80 } variable "percent_processor_time_threshold_warning" { - default = 60 + description = "Processor time percent (warning threshold)" + default = 60 } variable "server_load_rate_threshold_critical" { - default = 90 + description = "Server CPU load rate (critical threshold)" + default = 90 } variable "server_load_rate_threshold_warning" { - default = 70 + description = "Server CPU load rate (warning threshold)" + default = 70 } diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index b3ad63a..8287dad 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -13,7 +13,8 @@ resource "datadog_monitor" "status" { query = < ${var.evictedkeys_limit_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { warning = "${var.evictedkeys_limit_threshold_warning}" @@ -64,7 +66,8 @@ resource "datadog_monitor" "percent_processor_time" { avg:azure.cache_redis.percent_processor_time{${data.template_file.filter.rendered}} by {name,resource_group} ) > ${var.percent_processor_time_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { warning = "${var.percent_processor_time_threshold_warning}" @@ -92,7 +95,8 @@ resource "datadog_monitor" "server_load" { avg:azure.cache_redis.server_load{${data.template_file.filter.rendered}} by {name,resource_group} ) > ${var.server_load_rate_threshold_critical} EOF - type = 
"query alert" + + type = "query alert" thresholds { warning = "${var.server_load_rate_threshold_warning}" From c624b041a42121fc631fd9dcd27497e6351fe9e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:09:46 +0100 Subject: [PATCH 50/93] MON-76: Uses the right DD tag for Azure Redis --- cloud/azure/redis/README.md | 4 ++-- cloud/azure/redis/monitors-azure-redis.tf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cloud/azure/redis/README.md b/cloud/azure/redis/README.md index d885193..8520c6b 100644 --- a/cloud/azure/redis/README.md +++ b/cloud/azure/redis/README.md @@ -44,6 +44,6 @@ Inputs Related documentation --------------------- -DataDog documentation: https://docs.datadoghq.com/integrations/azure_redis_cache/ +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_redis_cache/](https://docs.datadoghq.com/integrations/azure_redis_cache/) -Azure Redis metrics documentation: https://docs.microsoft.com/en-us/azure/redis-cache/cache-how-to-monitor +Azure Redis metrics documentation: [https://docs.microsoft.com/en-us/azure/redis-cache/cache-how-to-monitor](https://docs.microsoft.com/en-us/azure/redis-cache/cache-how-to-monitor) diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index 8287dad..92652e9 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}" + filter = "${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_redis:enabled,env:%s", var.environment) : "*"}" } } From 0a4345dfa39618213c71e8200153b03fd0bf5645 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 3 Nov 2017 20:56:04 +0100 Subject: [PATCH 51/93] MON-76 add subscription_id and tags, remove client_name --- cloud/azure/redis/README.md | 7 ++++--- cloud/azure/redis/inputs.tf | 24 +++++++++++++++++------ cloud/azure/redis/monitors-azure-redis.tf | 10 +++++++++- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/cloud/azure/redis/README.md b/cloud/azure/redis/README.md index 8520c6b..45538d1 100644 --- a/cloud/azure/redis/README.md +++ b/cloud/azure/redis/README.md @@ -9,9 +9,8 @@ module "datadog-monitors-azure-redis" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/redis?ref={revision}" message = "${module.datadog-message-alerting.alerting-message}" - environment = "${var.environment}" - client_name = "${var.client_name}" + subscription_id = "${var.subscription_id}" } ``` @@ -29,7 +28,6 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| client_name | Client name | string | - | yes | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | | evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no | @@ -37,9 +35,12 @@ Inputs | message | Message sent when a Redis monitor is triggered | string | - | yes | | percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no | | percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no | +| provider | What is the monitored provider | string | azure | no | | server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | | server_load_rate_threshold_warning | Server 
CPU load rate (warning threshold) | string | `70` | no | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | +| service | What is the monitored service | string | storage | no | +| subscription_id | Azure account id used as filter for monitors | string | - | yes | Related documentation --------------------- diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf index 89385e8..7c57d63 100644 --- a/cloud/azure/redis/inputs.tf +++ b/cloud/azure/redis/inputs.tf @@ -1,12 +1,24 @@ # Global Terraform -variable "client_name" { - description = "Client name" - type = "string" -} - variable "environment" { description = "Architecture environment" - type = "string" + type = "string" +} + +variable "subscription_id" { + description = "Azure account id used as filter for monitors" + type = "string" +} + +variable "provider" { + description = "Cloud provider which the monitor and its based metric depend on" + type = "string" + default = "azure" +} + +variable "service" { + description = "Service monitored by this set of monitors" + type = "string" + default = "storage" } # Global DataDog diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index 92652e9..950e9a1 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_redis:enabled,env:%s", var.environment) : "*"}" + filter = "${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}" } } @@ -26,6 +26,8 @@ EOF require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "evictedkeys" { @@ -55,6 +57,8 @@ EOF require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "percent_processor_time" { @@ -84,6 +88,8 @@ EOF require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "server_load" { @@ -113,4 +119,6 @@ EOF require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } From 753da1173437df811d43b65b651ccb1a98d63122 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Thu, 23 Nov 2017 17:12:16 +0100 Subject: [PATCH 52/93] MON-76 Normalize monitors --- cloud/azure/redis/README.md | 9 +++----- cloud/azure/redis/inputs.tf | 26 ++++++----------------- cloud/azure/redis/monitors-azure-redis.tf | 20 ++++++++--------- 3 files changed, 20 insertions(+), 35 deletions(-) diff --git a/cloud/azure/redis/README.md b/cloud/azure/redis/README.md index 45538d1..4cd7a51 100644 --- a/cloud/azure/redis/README.md +++ b/cloud/azure/redis/README.md @@ -8,9 +8,8 @@ How to use this module module "datadog-monitors-azure-redis" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/redis?ref={revision}" - message = "${module.datadog-message-alerting.alerting-message}" + message = "${module.datadog-message-alerting.alerting-message}" environment = "${var.environment}" - subscription_id = 
"${var.subscription_id}" } ``` @@ -32,15 +31,13 @@ Inputs | environment | Architecture environment | string | - | yes | | evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no | | evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when a Redis monitor is triggered | string | - | yes | | percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no | | percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no | -| provider | What is the monitored provider | string | azure | no | | server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | | server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | -| service | What is the monitored service | string | storage | no | -| subscription_id | Azure account id used as filter for monitors | string | - | yes | Related documentation --------------------- diff --git a/cloud/azure/redis/inputs.tf b/cloud/azure/redis/inputs.tf index 7c57d63..49750fa 100644 --- a/cloud/azure/redis/inputs.tf +++ b/cloud/azure/redis/inputs.tf @@ -4,23 +4,6 @@ variable "environment" { type = "string" } -variable "subscription_id" { - description = "Azure account id used as filter for monitors" - type = "string" -} - -variable "provider" { - description = "Cloud provider which the monitor and its based metric depend on" - type = "string" - default = "azure" -} - -variable "service" { - description = "Service monitored by this set of monitors" - type = "string" - 
default = "storage" -} - # Global DataDog variable "message" { description = "Message sent when a Redis monitor is triggered" @@ -31,11 +14,16 @@ variable "delay" { default = 600 } -variable "use_filter_tags" { - description = "Filter the data with service tags if true" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" default = "true" } +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + # Azure Redis specific variable "evictedkeys_limit_threshold_warning" { description = "Evicted keys limit (warning threshold)" diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index 950e9a1..57b3a6c 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}" + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_redis:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } @@ -14,9 +14,9 @@ resource "datadog_monitor" "status" { avg(last_5m):avg:azure.cache_redis.status{${data.template_file.filter.rendered}} by {name,resource_group} != 1 EOF - type = "query alert" + type = "metric alert" - notify_no_data = false + notify_no_data = true evaluation_delay = "${var.delay}" renotify_interval = 0 notify_audit = false @@ -27,7 +27,7 @@ EOF new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:redis", "team:azure", "provider:azure"] } resource "datadog_monitor" "evictedkeys" { @@ -40,7 +40,7 @@ resource "datadog_monitor" "evictedkeys" { ) > ${var.evictedkeys_limit_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.evictedkeys_limit_threshold_warning}" @@ -58,7 +58,7 @@ EOF new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:redis", "team:azure", "provider:azure"] } resource "datadog_monitor" "percent_processor_time" { @@ -71,7 +71,7 @@ resource "datadog_monitor" "percent_processor_time" { ) > ${var.percent_processor_time_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.percent_processor_time_threshold_warning}" @@ -89,7 +89,7 @@ EOF new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:redis", "team:azure", "provider:azure"] } resource "datadog_monitor" "server_load" { @@ -102,7 +102,7 @@ resource "datadog_monitor" "server_load" { ) > ${var.server_load_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { 
warning = "${var.server_load_rate_threshold_warning}" @@ -120,5 +120,5 @@ EOF new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:redis", "team:azure", "provider:azure"] } From 71d78bacdec6dd85a13fcee5c911088386aed37a Mon Sep 17 00:00:00 2001 From: Kevin Pecquet Date: Mon, 30 Oct 2017 15:48:26 +0100 Subject: [PATCH 53/93] MON-75 SQL DB monitors init --- cloud/azure/sql-database/README.md | 44 +++++++ cloud/azure/sql-database/inputs.tf | 49 ++++++++ .../monitors-sql-database-basics.tf | 109 ++++++++++++++++++ 3 files changed, 202 insertions(+) create mode 100644 cloud/azure/sql-database/README.md create mode 100644 cloud/azure/sql-database/inputs.tf create mode 100644 cloud/azure/sql-database/monitors-sql-database-basics.tf diff --git a/cloud/azure/sql-database/README.md b/cloud/azure/sql-database/README.md new file mode 100644 index 0000000..5fb0387 --- /dev/null +++ b/cloud/azure/sql-database/README.md @@ -0,0 +1,44 @@ +Azure SQL Database DataDog monitors +============================ + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-storage" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/sql-database?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" + client_name = "${var.client_name}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* CPU High +* Free disk space low +* DTU Consumption high +* SQL deadlocks + +Inputs +------ + +| Name | Type | Default | Required | +|------|:----:|:-------:|:--------:| +| client_name | Client name | string | - | yes | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| message | Message sent when a monitor is 
triggered | string | - | yes | +| use_filter_tags | Filter the data with service tags if true | string | `false` | no | +| dd_azure_sqldb | string | `disabled` | yes | +| cpu_threshold_warning | string | `85` | no | +| cpu_threshold_critical | string | `90` | no | +| diskspace_threshold_warning | string | `80` | no | +| diskspace_threshold_critical | string | `90` | no | +| dtu_threshold_warning | string | `85` | no | +| dtu_threshold_critical | string | `90` | no | +| deadlock_threshold_critical | string | `1` | no | diff --git a/cloud/azure/sql-database/inputs.tf b/cloud/azure/sql-database/inputs.tf new file mode 100644 index 0000000..77599b9 --- /dev/null +++ b/cloud/azure/sql-database/inputs.tf @@ -0,0 +1,49 @@ +variable "subscription_id" { + default = "" +} + +variable "message" { + description = "Message sent when a SQL DB monitor is triggered" +} + +variable "environment" {} + +variable "use_filter_tags" { + default = "false" +} + +variable "cpu_threshold_warning" { + default = "" +} + +variable "cpu_threshold_critical" { + default = "90" +} + +variable "diskspace_threshold_warning" { + default = "80" +} + +variable "diskspace_threshold_critical" { + default = "90" +} + +variable "dtu_threshold_warning" { + default = "85" +} + +variable "dtu_threshold_critical" { + default = "90" +} + +variable "deadlock_threshold_critical" { + default = "1" +} + +variable "delay" { + default = "600" +} + +variable "dd_azure_sqldb" { + default = "disabled" +} diff --git a/cloud/azure/sql-database/monitors-sql-database-basics.tf b/cloud/azure/sql-database/monitors-sql-database-basics.tf new file mode 100644 index 0000000..413e4020 --- /dev/null +++ b/cloud/azure/sql-database/monitors-sql-database-basics.tf @@ -0,0 +1,109 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_sqldb:enabled,env:%s",var.environment) : "*"}" + } +} + +resource "datadog_monitor" "sql-database_cpu_90_15min" { + name = "[${var.environment}] SQL Database CPU high > 90% for 15 min on {{name}}" + message = "${message}" + + count = "${var.dd_azure_sqldb == "enabled" ? 1 : 0 }" + + query = "avg(last_15m):avg:azure.sql_servers_databases.cpu_percent{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.cpu_threshold_critical}" + type = "query alert" + + thresholds { + critical = "${var.cpu_threshold_critical}" + } + + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "sql-database_free_space_low" { + name = "[${var.environment}] SQL Database free space < 10 % on {{name}}" + message = "${message}" + + type = "query alert" + query = "avg(last_15m):avg:azure.sql_servers_databases.storage_percent{${data.template_file.filter.rendered}} by {name,resource_group} > 90" + + count = "${var.dd_azure_sqldb == "enabled" ? 1 : 0 }" + + thresholds { + warning = "${var.diskspace_threshold_warning}" + critical = "${var.diskspace_threshold_critical}" + } + + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "sql-database_dtu_consumption_high" { + name = "[${var.environment}] DTU Consumption on {{name}} > 90" + message = "${message}" + + type = "query alert" + query = "avg(last_15m):azure.sql_servers_databases.dtu_consumption_percent{${data.template_file.filter.rendered}} by {name,resource_group} > 90" + + count = "${var.dd_azure_sqldb == "enabled" ? 
1 : 0 }" + + thresholds { + warning = "${var.dtu_threshold_warning}" + critical = "${var.dtu_threshold_critical}" + } + + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "sql-database_deadlocks_count" { + name = "[${var.environment}] SQL Deadlocks too high on {{name}}" + message = "${message}" + + type = "query alert" + query = "sum(last_5m):avg:azure.sql_servers_databases.deadlock{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() > ${var.deadlock_threshold_critical}" + + count = "${var.dd_azure_sqldb == "enabled" ? 1 : 0 }" + + thresholds { + critical = "${var.deadlock_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} From 220dfe019dec2656687f8eddb25a18146159cb06 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 7 Nov 2017 11:58:54 +0100 Subject: [PATCH 54/93] MON-75 Add some descriptions and add EOF on queries --- cloud/azure/sql-database/README.md | 29 +++++---- cloud/azure/sql-database/inputs.tf | 65 +++++++++++++------ .../monitors-sql-database-basics.tf | 36 ++++++---- 3 files changed, 84 insertions(+), 46 deletions(-) diff --git a/cloud/azure/sql-database/README.md b/cloud/azure/sql-database/README.md index 5fb0387..f135036 100644 --- a/cloud/azure/sql-database/README.md +++ b/cloud/azure/sql-database/README.md @@ -27,18 +27,19 @@ Creates a DataDog monitors with the following checks : Inputs ------ -| Name | Type | Default | Required | -|------|:----:|:-------:|:--------:| -| client_name | Client name | string | - | yes | +| Name | Description | Type | Default | 
Required | +|------|-------------|:----:|:-----:|:-----:| +| cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | +| cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `` | no | +| deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | -| environment | Architecture environment | string | - | yes | -| message | Message sent when a monitor is triggered | string | - | yes | -| use_filter_tags | Filter the data with service tags if true | string | `false` | no | -| dd_azure_sqldb | string | `disabled` | yes | -| cpu_threshold_warning | string | `85` | no | -| cpu_threshold_critical | string | `90` | no | -| diskspace_threshold_warning | string | `80` | no | -| diskspace_threshold_critical | string | `90` | no | -| dtu_threshold_warning | string | `85` | no | -| dtu_threshold_critical | string | `90` | no | -| deadlock_threshold_critical | string | `1` | no | +| diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no | +| diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no | +| dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no | +| dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no | +| environment | Architecture Environment | string | - | yes | +| message | Message sent when an alert is triggered | string | - | yes | +| provider | Cloud provider which the monitor and its based metric depend on | string | `azure` | no | +| service | Service monitored by this set of monitors | string | `sql-database` | no | +| subscription_id | Azure account id used as filter for monitors | string | - | yes | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | diff --git a/cloud/azure/sql-database/inputs.tf 
b/cloud/azure/sql-database/inputs.tf index 77599b9..d3abe46 100644 --- a/cloud/azure/sql-database/inputs.tf +++ b/cloud/azure/sql-database/inputs.tf @@ -1,49 +1,74 @@ +# Global Terraform +variable "environment" { + description = "Architecture Environment" + type = "string" +} + variable "subscription_id" { - default = "" + description = "Azure account id used as filter for monitors" + type = "string" +} + +variable "provider" { + description = "Cloud provider which the monitor and its based metric depend on" + type = "string" + default = "azure" +} + +variable "service" { + description = "Service monitored by this set of monitors" + type = "string" + default = "sql-database" +} + +# Global DataDog +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 } variable "message" { - description = "Message sent when a SQL DB monitor is triggered" + description = "Message sent when an alert is triggered" } -variable "environment" {} - variable "use_filter_tags" { - default = "false" + description = "Filter the data with service tags if true" + default = "true" } +# Azure SQL Database specific + variable "cpu_threshold_warning" { - default = "" + description = "CPU usage in percent (warning threshold)" + default = "" } variable "cpu_threshold_critical" { - default = "90" + description = "CPU usage in percent (critical threshold)" + default = "90" } variable "diskspace_threshold_warning" { - default = "80" + description = "Disk space used in percent (warning threshold)" + default = "80" } variable "diskspace_threshold_critical" { - default = "90" + description = "Disk space used in percent (critical threshold)" + default = "90" } variable "dtu_threshold_warning" { - default = "85" + description = "Amount of DTU used (warning threshold)" + default = "85" } variable "dtu_threshold_critical" { - default = "90" + description = "Amount of DTU used (critical threshold)" + default = "90" } variable "deadlock_threshold_critical" { - default = 
"1" -} - -variable "delay" { - default = "600" -} - -variable "dd_azure_sqldb" { - default = "disabled" + description = "Amount of Deadlocks (critical threshold)" + default = "1" } diff --git a/cloud/azure/sql-database/monitors-sql-database-basics.tf b/cloud/azure/sql-database/monitors-sql-database-basics.tf index 413e4020..5448b8a 100644 --- a/cloud/azure/sql-database/monitors-sql-database-basics.tf +++ b/cloud/azure/sql-database/monitors-sql-database-basics.tf @@ -10,10 +10,13 @@ resource "datadog_monitor" "sql-database_cpu_90_15min" { name = "[${var.environment}] SQL Database CPU high > 90% for 15 min on {{name}}" message = "${message}" - count = "${var.dd_azure_sqldb == "enabled" ? 1 : 0 }" + query = < ${var.cpu_threshold_critical} + EOF - query = "avg(last_15m):avg:azure.sql_servers_databases.cpu_percent{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.cpu_threshold_critical}" - type = "query alert" + type = "query alert" thresholds { critical = "${var.cpu_threshold_critical}" @@ -35,10 +38,13 @@ resource "datadog_monitor" "sql-database_free_space_low" { name = "[${var.environment}] SQL Database free space < 10 % on {{name}}" message = "${message}" - type = "query alert" - query = "avg(last_15m):avg:azure.sql_servers_databases.storage_percent{${data.template_file.filter.rendered}} by {name,resource_group} > 90" + type = "query alert" - count = "${var.dd_azure_sqldb == "enabled" ? 1 : 0 }" + query = < ${var.diskspace_threshold_critical} + EOF thresholds { warning = "${var.diskspace_threshold_warning}" @@ -61,10 +67,13 @@ resource "datadog_monitor" "sql-database_dtu_consumption_high" { name = "[${var.environment}] DTU Consumption on {{name}} > 90" message = "${message}" - type = "query alert" - query = "avg(last_15m):azure.sql_servers_databases.dtu_consumption_percent{${data.template_file.filter.rendered}} by {name,resource_group} > 90" + type = "query alert" - count = "${var.dd_azure_sqldb == "enabled" ? 
1 : 0 }" + query = < ${var.dtu_threshold_critical} + EOF thresholds { warning = "${var.dtu_threshold_warning}" @@ -87,10 +96,13 @@ resource "datadog_monitor" "sql-database_deadlocks_count" { name = "[${var.environment}] SQL Deadlocks too high on {{name}}" message = "${message}" - type = "query alert" - query = "sum(last_5m):avg:azure.sql_servers_databases.deadlock{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() > ${var.deadlock_threshold_critical}" + type = "query alert" - count = "${var.dd_azure_sqldb == "enabled" ? 1 : 0 }" + query = < ${var.deadlock_threshold_critical} + EOF thresholds { critical = "${var.deadlock_threshold_critical}" From 6c5bdaa042c11f2a4217b3b3179f1001d407ea56 Mon Sep 17 00:00:00 2001 From: Marc-Antoine ADELISE Date: Mon, 30 Oct 2017 16:32:09 +0100 Subject: [PATCH 55/93] MON-74: Added first Azure App Services resources --- cloud/azure/app-services/inputs.tf | 107 ++++++++++++++++++ .../app-services/monitors-app_services.tf | 49 ++++++++ 2 files changed, 156 insertions(+) create mode 100644 cloud/azure/app-services/inputs.tf create mode 100644 cloud/azure/app-services/monitors-app_services.tf diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf new file mode 100644 index 0000000..830fcc2 --- /dev/null +++ b/cloud/azure/app-services/inputs.tf @@ -0,0 +1,107 @@ +variable "filter_tags" { + default = "*" +} + +################################### +### RESPONSE TIME VARIABLES ### +################################### +variable "response_time_appserv_eval_delay" { + default = 600 +} + +variable "response_time_critical_threshold" { + default = 0.8 + description = "Alerting threshold in seconds" +} + +variable "response_time_threshold_warning" { + default = 0.4 + description = "Warning threshold in seconds" +} + +variable "response_time_last_time_window_code" { + default = "1h" + description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... 
to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" +} + +variable "response_time_tags" { + default = [] + description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" +} + +variable "response_time_timeout_h" { + default = false + description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." +} + +variable "response_time_include_tags" { + default = false + description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." +} + +variable "response_time_notify_no_data" { + default = true + description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +} + +variable "response_time_renotify_interval" { + default = 0 + description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." +} + +variable "response_time_escalation_message" { + default = "Escalation message @pagerduty" + description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." +} + +################################### +### MEMORY USAGE VARIABLES ### +################################### +variable "memory_usage_appserv_eval_delay" { + default = 600 +} + +variable "memory_usage_threshold_critical" { + default = 52430000 + description = "Alerting threshold in Mib" +} + +variable "memory_usage_threshold_warning" { + default = 33550000 + description = "Warning threshold in MiB" +} + +variable "memory_usage_last_time_window_code" { + default = "5m" + description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... 
to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" +} + +variable "memory_usage_tags" { + default = [] + description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" +} + +variable "memory_usage_timeout_h" { + default = false + description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." +} + +variable "memory_usage_include_tags" { + default = false + description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." +} + +variable "memory_usage_notify_no_data" { + default = true + description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +} + +variable "memory_usage_renotify_interval" { + default = 0 + description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." +} + +variable "memory_usage_escalation_message" { + default = "Escalation message @pagerduty" + description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
+} diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf new file mode 100644 index 0000000..7bf1f99 --- /dev/null +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -0,0 +1,49 @@ +# Monitoring App Services response time +resource "datadog_monitor" "appservices_reponse_time" { + name = "[${var.environment}] App Services response time {{value}}s is above ${var.reponse_time_threshold_critical}s" + type = "query alert" + message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + escalation_message = "${var.reponse_time_escalation_message}" + + query = "avg(last_${var.reponse_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.reponse_time_threshold_critical}" + + evaluation_delay = "${var.reponse_time_appserv_eval_delay}" + + thresholds { + warning = "${var.reponse_time_threshold_warning}" + critical = "${var.reponse_time_threshold_critical}" + } + + notify_no_data = "${var.reponse_time_notify_no_data}" + renotify_interval = "${var.reponse_time_renotify_interval}" + + timeout_h = "${var.reponse_time_timeout_h}" + include_tags = "${var.reponse_time_include_tags}" + + tags = "${var.reponse_time_tags}" +} + +# Monitoring App Services memory usage +resource "datadog_monitor" "appservices_memory_usage" { + name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" + type = "query alert" + message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + escalation_message = "${var.memory_usage_escalation_message}" + + query = "avg(last_${var.memory_usage_last_time_window_code}):avg:azure.app_services.memory_working_set{*} >= ${var.memory_usage_threshold_critical}" + + evaluation_delay = "${var.memory_usage_appserv_eval_delay}" + + thresholds { + 
warning = "${var.memory_usage_threshold_warning}" + critical = "${var.memory_usage_threshold_critical}" + } + + notify_no_data = "${var.memory_usage_notify_no_data}" + renotify_interval = "${var.memory_usage_renotify_interval}" + + timeout_h = "${var.memory_usage_timeout_h}" + include_tags = "${var.memory_usage_include_tags}" + + tags = "${var.memory_usage_tags}" +} From 81df985f3297bcf3e993fef42f3a98146339bce0 Mon Sep 17 00:00:00 2001 From: Marc-Antoine ADELISE Date: Tue, 31 Oct 2017 10:08:19 +0100 Subject: [PATCH 56/93] MON-74: Response time, memory usage, http 404 status code and non 2xx http response status code percentage monitoring. --- cloud/azure/app-services/inputs.tf | 158 ++++++++++++++++-- .../app-services/monitors-app_services.tf | 81 +++++++-- 2 files changed, 215 insertions(+), 24 deletions(-) diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 830fcc2..666a394 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -1,7 +1,13 @@ -variable "filter_tags" { +variable "environment" {} + +variable "use_filter_tags" { default = "*" } +variable "critical_escalation_group" { + default = "HO_Dummy" +} + ################################### ### RESPONSE TIME VARIABLES ### ################################### @@ -9,7 +15,7 @@ variable "response_time_appserv_eval_delay" { default = 600 } -variable "response_time_critical_threshold" { +variable "response_time_threshold_critical" { default = 0.8 description = "Alerting threshold in seconds" } @@ -24,6 +30,11 @@ variable "response_time_last_time_window_code" { description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } +variable "response_time_require_full_window" { + default = false + description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. 
We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." +} + variable "response_time_tags" { default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" @@ -39,10 +50,10 @@ variable "response_time_include_tags" { description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." } -variable "response_time_notify_no_data" { - default = true - description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -} +# variable "response_time_notify_no_data" { +# default = true +# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +# } variable "response_time_renotify_interval" { default = 0 @@ -76,6 +87,11 @@ variable "memory_usage_last_time_window_code" { description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } +variable "memory_usage_require_full_window" { + default = false + description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." +} + variable "memory_usage_tags" { default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the API" @@ -91,10 +107,10 @@ variable "memory_usage_include_tags" { description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." } -variable "memory_usage_notify_no_data" { - default = true - description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -} +# variable "memory_usage_notify_no_data" { +# default = true +# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +# } variable "memory_usage_renotify_interval" { default = 0 @@ -105,3 +121,125 @@ variable "memory_usage_escalation_message" { default = "Escalation message @pagerduty" description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." } + +################################# +### HTTP 404 status pages ### +################################# +variable "http_404_errors_count_rate_limit" { + default = 30 +} + +variable "http_404_errors_count_rate_appserv_eval_delay" { + default = 600 +} + +variable "http_404_errors_count_rate_threshold_critical" { + default = 30 + description = "Alerting threshold (number of requests)" +} + +variable "http_404_errors_count_rate_threshold_warning" { + default = 10 + description = "Warning threshold (number of requests)" +} + +variable "http_404_errors_count_rate_last_time_window_code" { + default = "5m" + description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" +} + +variable "http_404_errors_count_rate_require_full_window" { + default = true + description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. 
We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." +} + +variable "http_404_errors_count_rate_tags" { + default = [] + description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" +} + +variable "http_404_errors_count_rate_timeout_h" { + default = false + description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." +} + +variable "http_404_errors_count_rate_include_tags" { + default = false + description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." +} + +# variable "http_404_errors_count_rate_notify_no_data" { +# default = true +# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +# } + +variable "http_404_errors_count_rate_renotify_interval" { + default = 0 + description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." +} + +variable "http_404_errors_count_rate_escalation_message" { + default = "Escalation message @pagerduty" + description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
+} + +################################# +### HTTP 202 status pages ### +################################# +variable "http_2xx_status_rate_limit" { + default = 30 +} + +variable "http_2xx_status_rate_appserv_eval_delay" { + default = 600 +} + +variable "http_2xx_status_rate_threshold_critical" { + default = 0.9 + description = "Alerting threshold (percentage)" +} + +variable "http_2xx_status_rate_threshold_warning" { + default = 0.95 + description = "Warning threshold (percentage)" +} + +variable "http_2xx_status_rate_last_time_window_code" { + default = "5m" + description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" +} + +variable "http_2xx_status_rate_require_full_window" { + default = true + description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." +} + +variable "http_2xx_status_rate_tags" { + default = [] + description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" +} + +variable "http_2xx_status_rate_timeout_h" { + default = false + description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." +} + +variable "http_2xx_status_rate_include_tags" { + default = false + description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." 
+} + +# variable "http_2xx_status_rate_notify_no_data" { +# default = true +# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." +# } + +variable "http_2xx_status_rate_renotify_interval" { + default = 0 + description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." +} + +variable "http_2xx_status_rate_escalation_message" { + default = "Escalation message @pagerduty" + description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." +} diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 7bf1f99..892b2c4 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -1,30 +1,31 @@ # Monitoring App Services response time -resource "datadog_monitor" "appservices_reponse_time" { - name = "[${var.environment}] App Services response time {{value}}s is above ${var.reponse_time_threshold_critical}s" +resource "datadog_monitor" "appservices_response_time" { + name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" type = "query alert" message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" - escalation_message = "${var.reponse_time_escalation_message}" + escalation_message = "${var.response_time_escalation_message}" - query = "avg(last_${var.reponse_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.reponse_time_threshold_critical}" + query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.response_time_threshold_critical}" - evaluation_delay = "${var.reponse_time_appserv_eval_delay}" + 
evaluation_delay = "${var.response_time_appserv_eval_delay}" thresholds { - warning = "${var.reponse_time_threshold_warning}" - critical = "${var.reponse_time_threshold_critical}" + warning = "${var.response_time_threshold_warning}" + critical = "${var.response_time_threshold_critical}" } - notify_no_data = "${var.reponse_time_notify_no_data}" - renotify_interval = "${var.reponse_time_renotify_interval}" + notify_no_data = true # Will notify when no data is received + renotify_interval = "${var.response_time_renotify_interval}" + require_full_window = "${var.response_time_require_full_window}" - timeout_h = "${var.reponse_time_timeout_h}" - include_tags = "${var.reponse_time_include_tags}" + timeout_h = "${var.response_time_timeout_h}" + include_tags = "${var.response_time_include_tags}" - tags = "${var.reponse_time_tags}" + tags = "${var.response_time_tags}" } # Monitoring App Services memory usage -resource "datadog_monitor" "appservices_memory_usage" { +resource "datadog_monitor" "appservices_memory_usage_count" { name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" type = "query alert" message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" @@ -39,11 +40,63 @@ resource "datadog_monitor" "appservices_memory_usage" { critical = "${var.memory_usage_threshold_critical}" } - notify_no_data = "${var.memory_usage_notify_no_data}" + notify_no_data = true # Will notify when no data is received renotify_interval = "${var.memory_usage_renotify_interval}" + require_full_window = "${var.memory_usage_require_full_window}" timeout_h = "${var.memory_usage_timeout_h}" include_tags = "${var.memory_usage_include_tags}" tags = "${var.memory_usage_tags}" } + +# Monitoring App Services 404 errors rate +resource "datadog_monitor" "appservices_http_404_errors_count" { + name = "[${var.environment}] App Services {{value}} 
HTTP errors > ${var.http_404_errors_count_rate_limit} limit" + type = "query alert" + message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + escalation_message = "${var.http_404_errors_count_rate_escalation_message}" + + query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{*}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" + + evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}" + + thresholds { + warning = "${var.http_404_errors_count_rate_threshold_warning}" + critical = "${var.http_404_errors_count_rate_threshold_critical}" + } + + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}" + require_full_window = true + + timeout_h = "${var.http_404_errors_count_rate_timeout_h}" + include_tags = "${var.http_404_errors_count_rate_include_tags}" + + tags = "${var.http_404_errors_count_rate_tags}" +} + +# Monitoring App Services HTTP 2xx status pages rate +resource "datadog_monitor" "appservices_http_2xx_status_rate" { + name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" + type = "query alert" + message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + escalation_message = "${var.http_2xx_status_rate_escalation_message}" + + query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.http2xx{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" + evaluation_delay = "${var.http_2xx_status_rate_appserv_eval_delay}" + + thresholds { + warning = "${var.http_2xx_status_rate_threshold_warning}" + critical = "${var.http_2xx_status_rate_threshold_critical}" + } + + notify_no_data = true 
# Will notify when no data is received + renotify_interval = "${var.http_2xx_status_rate_renotify_interval}" + require_full_window = true + + timeout_h = "${var.http_2xx_status_rate_timeout_h}" + include_tags = "${var.http_2xx_status_rate_include_tags}" + + tags = "${var.http_2xx_status_rate_tags}" +} From 58bbe0bc7bd08c92c26719b839d185f2e682c54f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:18:52 +0100 Subject: [PATCH 57/93] MON-74: fmt --- cloud/azure/app-services/inputs.tf | 72 +++++++++---------- .../app-services/monitors-app_services.tf | 18 ++--- 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 666a394..dc26017 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -16,37 +16,37 @@ variable "response_time_appserv_eval_delay" { } variable "response_time_threshold_critical" { - default = 0.8 + default = 0.8 description = "Alerting threshold in seconds" } variable "response_time_threshold_warning" { - default = 0.4 + default = 0.4 description = "Warning threshold in seconds" } variable "response_time_last_time_window_code" { - default = "1h" + default = "1h" description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } variable "response_time_require_full_window" { - default = false + default = false description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." } variable "response_time_tags" { - default = [] + default = [] description = "A list of tags to associate with your monitor. 
This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" } variable "response_time_timeout_h" { - default = false + default = false description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } variable "response_time_include_tags" { - default = false + default = false description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." } @@ -56,12 +56,12 @@ variable "response_time_include_tags" { # } variable "response_time_renotify_interval" { - default = 0 + default = 0 description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." } variable "response_time_escalation_message" { - default = "Escalation message @pagerduty" + default = "Escalation message @pagerduty" description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." } @@ -73,37 +73,37 @@ variable "memory_usage_appserv_eval_delay" { } variable "memory_usage_threshold_critical" { - default = 52430000 + default = 52430000 description = "Alerting threshold in Mib" } variable "memory_usage_threshold_warning" { - default = 33550000 + default = 33550000 description = "Warning threshold in MiB" } variable "memory_usage_last_time_window_code" { - default = "5m" + default = "5m" description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } variable "memory_usage_require_full_window" { - default = false + default = false description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. 
We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." } variable "memory_usage_tags" { - default = [] + default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" } variable "memory_usage_timeout_h" { - default = false + default = false description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } variable "memory_usage_include_tags" { - default = false + default = false description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." } @@ -113,12 +113,12 @@ variable "memory_usage_include_tags" { # } variable "memory_usage_renotify_interval" { - default = 0 + default = 0 description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." } variable "memory_usage_escalation_message" { - default = "Escalation message @pagerduty" + default = "Escalation message @pagerduty" description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
} @@ -134,37 +134,37 @@ variable "http_404_errors_count_rate_appserv_eval_delay" { } variable "http_404_errors_count_rate_threshold_critical" { - default = 30 + default = 30 description = "Alerting threshold (number of requests)" } variable "http_404_errors_count_rate_threshold_warning" { - default = 10 + default = 10 description = "Warning threshold (number of requests)" } variable "http_404_errors_count_rate_last_time_window_code" { - default = "5m" + default = "5m" description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } variable "http_404_errors_count_rate_require_full_window" { - default = true + default = true description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." } variable "http_404_errors_count_rate_tags" { - default = [] + default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" } variable "http_404_errors_count_rate_timeout_h" { - default = false + default = false description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } variable "http_404_errors_count_rate_include_tags" { - default = false + default = false description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." 
} @@ -174,12 +174,12 @@ variable "http_404_errors_count_rate_include_tags" { # } variable "http_404_errors_count_rate_renotify_interval" { - default = 0 + default = 0 description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." } variable "http_404_errors_count_rate_escalation_message" { - default = "Escalation message @pagerduty" + default = "Escalation message @pagerduty" description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." } @@ -195,37 +195,37 @@ variable "http_2xx_status_rate_appserv_eval_delay" { } variable "http_2xx_status_rate_threshold_critical" { - default = 0.9 + default = 0.9 description = "Alerting threshold (percentage)" } variable "http_2xx_status_rate_threshold_warning" { - default = 0.95 + default = 0.95 description = "Warning threshold (percentage)" } variable "http_2xx_status_rate_last_time_window_code" { - default = "5m" + default = "5m" description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" } variable "http_2xx_status_rate_require_full_window" { - default = true + default = true description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." } variable "http_2xx_status_rate_tags" { - default = [] + default = [] description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the API" } variable "http_2xx_status_rate_timeout_h" { - default = false + default = false description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } variable "http_2xx_status_rate_include_tags" { - default = false + default = false description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." } @@ -235,11 +235,11 @@ variable "http_2xx_status_rate_include_tags" { # } variable "http_2xx_status_rate_renotify_interval" { - default = 0 + default = 0 description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." } variable "http_2xx_status_rate_escalation_message" { - default = "Escalation message @pagerduty" + default = "Escalation message @pagerduty" description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
} diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 892b2c4..3e5f94a 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -14,8 +14,8 @@ resource "datadog_monitor" "appservices_response_time" { critical = "${var.response_time_threshold_critical}" } - notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.response_time_renotify_interval}" + notify_no_data = true # Will notify when no data is received + renotify_interval = "${var.response_time_renotify_interval}" require_full_window = "${var.response_time_require_full_window}" timeout_h = "${var.response_time_timeout_h}" @@ -40,8 +40,8 @@ resource "datadog_monitor" "appservices_memory_usage_count" { critical = "${var.memory_usage_threshold_critical}" } - notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.memory_usage_renotify_interval}" + notify_no_data = true # Will notify when no data is received + renotify_interval = "${var.memory_usage_renotify_interval}" require_full_window = "${var.memory_usage_require_full_window}" timeout_h = "${var.memory_usage_timeout_h}" @@ -66,8 +66,8 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { critical = "${var.http_404_errors_count_rate_threshold_critical}" } - notify_no_data = false # Will NOT notify when no data is received - renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}" + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}" require_full_window = true timeout_h = "${var.http_404_errors_count_rate_timeout_h}" @@ -83,7 +83,7 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" 
escalation_message = "${var.http_2xx_status_rate_escalation_message}" - query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.http2xx{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" + query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.http2xx{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" evaluation_delay = "${var.http_2xx_status_rate_appserv_eval_delay}" thresholds { @@ -91,8 +91,8 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { critical = "${var.http_2xx_status_rate_threshold_critical}" } - notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.http_2xx_status_rate_renotify_interval}" + notify_no_data = true # Will notify when no data is received + renotify_interval = "${var.http_2xx_status_rate_renotify_interval}" require_full_window = true timeout_h = "${var.http_2xx_status_rate_timeout_h}" From 4c9bc13de0ae6365d94a3a3d311a8f3339b5bd09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:22:18 +0100 Subject: [PATCH 58/93] MON-74: Use filter tags option --- cloud/azure/app-services/inputs.tf | 3 ++- .../app-services/monitors-app_services.tf | 19 ++++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index dc26017..8af09cb 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -1,7 +1,8 @@ variable "environment" {} variable "use_filter_tags" { - default = "*" + description = "Filter the data with service tags if true" + default = "true" } variable "critical_escalation_group" { diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 3e5f94a..48b8184 100644 --- 
a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -1,3 +1,11 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_appservices:enabled,env:%s", var.environment) : "*"}" + } +} + # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" @@ -5,7 +13,7 @@ resource "datadog_monitor" "appservices_response_time" { message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" escalation_message = "${var.response_time_escalation_message}" - query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{*} >= ${var.response_time_threshold_critical}" + query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{${data.template_file.filter.rendered}} >= ${var.response_time_threshold_critical}" evaluation_delay = "${var.response_time_appserv_eval_delay}" @@ -31,7 +39,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" escalation_message = "${var.memory_usage_escalation_message}" - query = "avg(last_${var.memory_usage_last_time_window_code}):avg:azure.app_services.memory_working_set{*} >= ${var.memory_usage_threshold_critical}" + query = "avg(last_${var.memory_usage_last_time_window_code}):avg:azure.app_services.memory_working_set{${data.template_file.filter.rendered}} >= ${var.memory_usage_threshold_critical}" evaluation_delay = "${var.memory_usage_appserv_eval_delay}" @@ -57,7 +65,7 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { 
message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" escalation_message = "${var.http_404_errors_count_rate_escalation_message}" - query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{*}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" + query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{${data.template_file.filter.rendered}}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}" @@ -83,7 +91,7 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" escalation_message = "${var.http_2xx_status_rate_escalation_message}" - query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{*}.as_count() / avg:azure.app_services.http2xx{*}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" + query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() / avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" evaluation_delay = "${var.http_2xx_status_rate_appserv_eval_delay}" thresholds { @@ -91,7 +99,8 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { critical = "${var.http_2xx_status_rate_threshold_critical}" } - notify_no_data = true # Will notify when no data is received + # Will notify when no data is received + notify_no_data = true renotify_interval = "${var.http_2xx_status_rate_renotify_interval}" require_full_window = true From ac96ee6586a2800c13dde2b25ac456c5d695d15b Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:24:08 +0100 Subject: [PATCH 59/93] MON-74: Uses generic message parameter --- cloud/azure/app-services/inputs.tf | 12 ++++++++---- cloud/azure/app-services/monitors-app_services.tf | 8 ++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 8af09cb..4ad908b 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -1,19 +1,23 @@ -variable "environment" {} +variable "environment" { + description = "Architecture environment" + type = "string" +} variable "use_filter_tags" { description = "Filter the data with service tags if true" default = "true" } -variable "critical_escalation_group" { - default = "HO_Dummy" +variable "message" { + description = "Message sent when a monitor is triggered" } ################################### ### RESPONSE TIME VARIABLES ### ################################### variable "response_time_appserv_eval_delay" { - default = 600 + description = "Delay in seconds for the metric evaluation" + default = 600 } variable "response_time_threshold_critical" { diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 48b8184..9447cb4 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -10,7 +10,7 @@ data "template_file" "filter" { resource "datadog_monitor" "appservices_response_time" { name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" type = "query alert" - message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + message = "${var.message}" escalation_message = "${var.response_time_escalation_message}" query = 
"avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{${data.template_file.filter.rendered}} >= ${var.response_time_threshold_critical}" @@ -36,7 +36,7 @@ resource "datadog_monitor" "appservices_response_time" { resource "datadog_monitor" "appservices_memory_usage_count" { name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" type = "query alert" - message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + message = "${var.message}" escalation_message = "${var.memory_usage_escalation_message}" query = "avg(last_${var.memory_usage_last_time_window_code}):avg:azure.app_services.memory_working_set{${data.template_file.filter.rendered}} >= ${var.memory_usage_threshold_critical}" @@ -62,7 +62,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { resource "datadog_monitor" "appservices_http_404_errors_count" { name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" type = "query alert" - message = "{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + message = "${var.message}" escalation_message = "${var.http_404_errors_count_rate_escalation_message}" query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{${data.template_file.filter.rendered}}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" @@ -88,7 +88,7 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { resource "datadog_monitor" "appservices_http_2xx_status_rate" { name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" type = "query alert" - message = 
"{{#is_alert}}${var.critical_escalation_group}{{/is_alert}}{{#is_recovery}}${var.critical_escalation_group}{{/is_recovery}}" + message = "${var.message}" escalation_message = "${var.http_2xx_status_rate_escalation_message}" query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() / avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() < ${var.http_2xx_status_rate_threshold_critical}" From 31e036a8055c1404cb6b74808a701652fef42c04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:27:09 +0100 Subject: [PATCH 60/93] MON-74: Readme --- cloud/azure/app-services/README.md | 83 ++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 cloud/azure/app-services/README.md diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md new file mode 100644 index 0000000..443c819 --- /dev/null +++ b/cloud/azure/app-services/README.md @@ -0,0 +1,83 @@ +Azure AppServices (Web, API, Functions) DataDog monitors +======================================================== + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-app-services" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/app-services?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" + client_name = "${var.client_name}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* Response time +* Memory usage count +* HTTP 404 errors +* HTTP 50x errors +* HTTP 20x rate + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| environment | Architecture environment | string | - | yes | +| http_2xx_status_rate_appserv_eval_delay | | string | `600` | no | +| 
http_2xx_status_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| http_2xx_status_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_2xx_status_rate_limit | ################################ ## HTTP 202 status pages ### ################################ | string | `30` | no | +| http_2xx_status_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | +| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | +| http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | +| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. 
| string | `false` | no | +| http_404_errors_count_rate_appserv_eval_delay | | string | `600` | no | +| http_404_errors_count_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| http_404_errors_count_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_404_errors_count_rate_limit | ################################ ## HTTP 404 status pages ### ################################ | string | `30` | no | +| http_404_errors_count_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | +| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | +| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | +| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| memory_usage_appserv_eval_delay | ################################## ## MEMORY USAGE VARIABLES ### ################################## | string | `600` | no | +| memory_usage_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| memory_usage_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| memory_usage_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| memory_usage_tags | A list of tags to associate with your monitor. 
This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | +| memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | +| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| response_time_appserv_eval_delay | Delay in seconds for the metric evaluation | string | `600` | no | +| response_time_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| response_time_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | +| response_time_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| response_time_tags | A list of tags to associate with your monitor. 
This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | +| response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | +| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | + + +Related documentation +--------------------- + +DataDog documentation: https://docs.datadoghq.com/integrations/azure_app_services From 98f5b6f331f381b9c7300f12036ac34320d0a718 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 11:31:55 +0100 Subject: [PATCH 61/93] MON-74: Readme update --- cloud/azure/app-services/README.md | 165 ++++++++++++++--------------- cloud/azure/app-services/inputs.tf | 4 + 2 files changed, 86 insertions(+), 83 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index 443c819..d8a02c7 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -1,83 +1,82 @@ -Azure AppServices (Web, API, Functions) DataDog monitors -======================================================== - -How to use this module ----------------------- - -``` -module "datadog-monitors-azure-app-services" { - source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/app-services?ref={revision}" - - message = "${module.datadog-message-alerting.alerting-message}" - - environment = "${var.environment}" - client_name = "${var.client_name}" -} -``` - -Purpose -------- -Creates a DataDog monitors with the following checks : - -* Response time -* Memory usage count -* HTTP 404 errors -* 
HTTP 50x errors -* HTTP 20x rate - -Inputs ------- - -| Name | Description | Type | Default | Required | -|------|-------------|:----:|:-----:|:-----:| -| environment | Architecture environment | string | - | yes | -| http_2xx_status_rate_appserv_eval_delay | | string | `600` | no | -| http_2xx_status_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| http_2xx_status_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| http_2xx_status_rate_limit | ################################ ## HTTP 202 status pages ### ################################ | string | `30` | no | -| http_2xx_status_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | -| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | -| http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | -| http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| http_404_errors_count_rate_appserv_eval_delay | | string | `600` | no | -| http_404_errors_count_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| http_404_errors_count_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| http_404_errors_count_rate_limit | ################################ ## HTTP 404 status pages ### ################################ | string | `30` | no | -| http_404_errors_count_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. 
| string | `true` | no | -| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | -| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | -| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | -| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| memory_usage_appserv_eval_delay | ################################## ## MEMORY USAGE VARIABLES ### ################################## | string | `600` | no | -| memory_usage_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| memory_usage_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| memory_usage_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. 
False otherwise. | string | `false` | no | -| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | -| memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | -| memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | -| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| message | Message sent when a monitor is triggered | string | - | yes | -| response_time_appserv_eval_delay | Delay in seconds for the metric evaluation | string | `600` | no | -| response_time_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| response_time_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | -| response_time_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. 
| string | `false` | no | -| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | -| response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | -| response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | -| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | - - -Related documentation ---------------------- - -DataDog documentation: https://docs.datadoghq.com/integrations/azure_app_services +Azure AppServices (Web, API, Functions) DataDog monitors +======================================================== + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-app-services" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/app-services?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" + client_name = "${var.client_name}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* Response time +* Memory usage count +* HTTP 404 errors +* HTTP 50x errors +* HTTP 20x rate + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| environment | Architecture environment | string | - | yes | +| http_2xx_status_rate_appserv_eval_delay | | string | `600` | no | +| http_2xx_status_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. 
| string | `Escalation message @pagerduty` | no | +| http_2xx_status_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_2xx_status_rate_limit | | string | `30` | no | +| http_2xx_status_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | +| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | +| http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | +| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| http_404_errors_count_rate_appserv_eval_delay | | string | `600` | no | +| http_404_errors_count_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. 
| string | `Escalation message @pagerduty` | no | +| http_404_errors_count_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_404_errors_count_rate_limit | | string | `30` | no | +| http_404_errors_count_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | +| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | +| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | +| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| memory_usage_appserv_eval_delay | | string | `600` | no | +| memory_usage_escalation_message | A message to include with a re-notification. 
Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| memory_usage_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| memory_usage_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | +| memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | +| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| response_time_appserv_eval_delay | Delay in seconds for the metric evaluation | string | `600` | no | +| response_time_escalation_message | A message to include with a re-notification. 
Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | +| response_time_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | +| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | +| response_time_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | +| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | +| response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | +| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. 
| string | `false` | no | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_app_services](https://docs.datadoghq.com/integrations/azure_app_services) diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 4ad908b..4f2a693 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -15,6 +15,7 @@ variable "message" { ################################### ### RESPONSE TIME VARIABLES ### ################################### + variable "response_time_appserv_eval_delay" { description = "Delay in seconds for the metric evaluation" default = 600 @@ -73,6 +74,7 @@ variable "response_time_escalation_message" { ################################### ### MEMORY USAGE VARIABLES ### ################################### + variable "memory_usage_appserv_eval_delay" { default = 600 } @@ -130,6 +132,7 @@ variable "memory_usage_escalation_message" { ################################# ### HTTP 404 status pages ### ################################# + variable "http_404_errors_count_rate_limit" { default = 30 } @@ -191,6 +194,7 @@ variable "http_404_errors_count_rate_escalation_message" { ################################# ### HTTP 202 status pages ### ################################# + variable "http_2xx_status_rate_limit" { default = 30 } From dc06fb9519175c55c6d12b60ceea20a71ac4af0e Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 15:28:41 +0100 Subject: [PATCH 62/93] MON-74 Add EOF on querys --- .../app-services/monitors-app_services.tf | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 9447cb4..c42ad6c 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ 
b/cloud/azure/app-services/monitors-app_services.tf @@ -13,7 +13,11 @@ resource "datadog_monitor" "appservices_response_time" { message = "${var.message}" escalation_message = "${var.response_time_escalation_message}" - query = "avg(last_${var.response_time_last_time_window_code}):avg:azure.app_services.average_response_time{${data.template_file.filter.rendered}} >= ${var.response_time_threshold_critical}" + query = <= ${var.response_time_threshold_critical} + EOF evaluation_delay = "${var.response_time_appserv_eval_delay}" @@ -39,7 +43,11 @@ resource "datadog_monitor" "appservices_memory_usage_count" { message = "${var.message}" escalation_message = "${var.memory_usage_escalation_message}" - query = "avg(last_${var.memory_usage_last_time_window_code}):avg:azure.app_services.memory_working_set{${data.template_file.filter.rendered}} >= ${var.memory_usage_threshold_critical}" + query = <= ${var.memory_usage_threshold_critical} + EOF evaluation_delay = "${var.memory_usage_appserv_eval_delay}" @@ -65,7 +73,11 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { message = "${var.message}" escalation_message = "${var.http_404_errors_count_rate_escalation_message}" - query = "max(last_${var.http_404_errors_count_rate_last_time_window_code}):per_minute(avg:azure.app_services.http404{${data.template_file.filter.rendered}}.as_rate()) > ${var.http_404_errors_count_rate_threshold_critical}" + query = < ${var.http_404_errors_count_rate_threshold_critical} + EOF evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}" @@ -91,7 +103,13 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { message = "${var.message}" escalation_message = "${var.http_2xx_status_rate_escalation_message}" - query = "avg(last_${var.http_2xx_status_rate_last_time_window_code}):avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() / avg:azure.app_services.http2xx{${data.template_file.filter.rendered}}.as_count() < 
${var.http_2xx_status_rate_threshold_critical}" + query = < Date: Thu, 2 Nov 2017 16:54:18 +0100 Subject: [PATCH 63/93] MON-74 Fix changes to fit as the other modules --- cloud/azure/app-services/README.md | 80 +++++++++-------- cloud/azure/app-services/inputs.tf | 86 +++---------------- .../app-services/monitors-app_services.tf | 60 ++++++------- 3 files changed, 86 insertions(+), 140 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index d8a02c7..90f5882 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -28,52 +28,64 @@ Creates a DataDog monitors with the following checks : Inputs ------ -| Name | Description | Type | Default | Required | -|------|-------------|:----:|:-----:|:-----:| +| Name | Description | Type | Default | Required | DESKTOP-0PBDRFR: ~ +|------|-------------|:----:|:-----:|:-----:| → +| client_name | Client Name | string | - | yes | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| http_2xx_status_rate_appserv_eval_delay | | string | `600` | no | -| http_2xx_status_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| http_2xx_status_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... 
to write last_# +m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | | http_2xx_status_rate_limit | | string | `30` | no | -| http_2xx_status_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | -| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data bef +ore it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be s +kipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` +| no | +| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter m +onitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying + via the API | string | `` | no | | http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | | http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| http_404_errors_count_rate_appserv_eval_delay | | string | `600` | no | -| http_404_errors_count_rate_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| http_404_errors_count_rate_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically +resolve from a triggered state. Defaults to false. | string | `false` | no | +| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write +last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | | http_404_errors_count_rate_limit | | string | `30` | no | -| http_404_errors_count_rate_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. 
| string | `0` | no | -| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` | no | -| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of da +ta before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations wil +l be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | ` +true` | no | +| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and fi +lter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when qu +erying via the API | string | `` | no | | http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | | http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | -| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | -| memory_usage_appserv_eval_delay | | string | `600` | no | -| memory_usage_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. 
| string | `Escalation message @pagerduty` | no | -| memory_usage_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| memory_usage_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automati +cally resolve from a triggered state. Defaults to false. | string | `false` | no | +| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, + 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's + evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. +Default: True for 'on average', 'at all times' and 'in total' aggregation. 
False otherwise. | string | `false` | no | +| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors +in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the + API | string | `` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | -| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve +from a triggered state. Defaults to false. | string | `false` | no | | message | Message sent when a monitor is triggered | string | - | yes | -| response_time_appserv_eval_delay | Delay in seconds for the metric evaluation | string | `600` | no | -| response_time_escalation_message | A message to include with a re-notification. Supports the '@username' notification allowed elsewhere. | string | `Escalation message @pagerduty` | no | -| response_time_include_tags | A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true. | string | `false` | no | -| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | -| response_time_renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved. | string | `0` | no | -| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's evaluated. 
We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API | string | `` | no | +| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5 +, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | +| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it' +s evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. + Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | +| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors + in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via th +e API | string | `` | no | | response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | | response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | -| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false. | string | `false` | no | +| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve + from a triggered state. Defaults to false. 
| string | `false` | no | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 4f2a693..5f0f2b0 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -3,6 +3,11 @@ variable "environment" { type = "string" } +variable "client_name" { + description = "Client Name" + type = "string" +} + variable "use_filter_tags" { description = "Filter the data with service tags if true" default = "true" @@ -12,15 +17,15 @@ variable "message" { description = "Message sent when a monitor is triggered" } -################################### -### RESPONSE TIME VARIABLES ### -################################### - -variable "response_time_appserv_eval_delay" { +variable "delay" { description = "Delay in seconds for the metric evaluation" default = 600 } +################################### +### RESPONSE TIME VARIABLES ### +################################### + variable "response_time_threshold_critical" { default = 0.8 description = "Alerting threshold in seconds" @@ -51,34 +56,15 @@ variable "response_time_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "response_time_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "response_time_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "response_time_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." 
-} - -variable "response_time_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." -} - ################################### ### MEMORY USAGE VARIABLES ### ################################### -variable "memory_usage_appserv_eval_delay" { - default = 600 -} - variable "memory_usage_threshold_critical" { default = 52430000 description = "Alerting threshold in Mib" @@ -109,26 +95,11 @@ variable "memory_usage_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "memory_usage_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "memory_usage_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "memory_usage_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." -} - -variable "memory_usage_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
-} - ################################# ### HTTP 404 status pages ### ################################# @@ -137,10 +108,6 @@ variable "http_404_errors_count_rate_limit" { default = 30 } -variable "http_404_errors_count_rate_appserv_eval_delay" { - default = 600 -} - variable "http_404_errors_count_rate_threshold_critical" { default = 30 description = "Alerting threshold (number of requests)" @@ -171,26 +138,11 @@ variable "http_404_errors_count_rate_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "http_404_errors_count_rate_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "http_404_errors_count_rate_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "http_404_errors_count_rate_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." -} - -variable "http_404_errors_count_rate_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
-} - ################################# ### HTTP 202 status pages ### ################################# @@ -199,10 +151,6 @@ variable "http_2xx_status_rate_limit" { default = 30 } -variable "http_2xx_status_rate_appserv_eval_delay" { - default = 600 -} - variable "http_2xx_status_rate_threshold_critical" { default = 0.9 description = "Alerting threshold (percentage)" @@ -233,22 +181,8 @@ variable "http_2xx_status_rate_timeout_h" { description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." } -variable "http_2xx_status_rate_include_tags" { - default = false - description = "A boolean indicating whether notifications from this monitor will automatically insert its triggering tags into the title. Defaults to true." -} - # variable "http_2xx_status_rate_notify_no_data" { # default = true # description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." # } -variable "http_2xx_status_rate_renotify_interval" { - default = 0 - description = "The number of minutes after the last notification before a monitor will re-notify on the current status. It will only re-notify if it's not resolved." -} - -variable "http_2xx_status_rate_escalation_message" { - default = "Escalation message @pagerduty" - description = "A message to include with a re-notification. Supports the '@username' notification allowed elsewhere." 
-} diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index c42ad6c..437b7fb 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -8,10 +8,9 @@ data "template_file" "filter" { # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { - name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.response_time_escalation_message}" + name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" + type = "query alert" + message = "${var.message}" query = <= ${var.response_time_threshold_critical} EOF - evaluation_delay = "${var.response_time_appserv_eval_delay}" + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" thresholds { warning = "${var.response_time_threshold_warning}" @@ -27,21 +27,20 @@ resource "datadog_monitor" "appservices_response_time" { } notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.response_time_renotify_interval}" + renotify_interval = 0 require_full_window = "${var.response_time_require_full_window}" timeout_h = "${var.response_time_timeout_h}" - include_tags = "${var.response_time_include_tags}" + include_tags = true tags = "${var.response_time_tags}" } # Monitoring App Services memory usage resource "datadog_monitor" "appservices_memory_usage_count" { - name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.memory_usage_escalation_message}" + name = "[${var.environment}] App Services memory usage {{value}} bytes is above 
${ceil(var.memory_usage_threshold_critical/1000000)}MiB" + type = "query alert" + message = "${var.message}" query = <= ${var.memory_usage_threshold_critical} EOF - evaluation_delay = "${var.memory_usage_appserv_eval_delay}" + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" thresholds { warning = "${var.memory_usage_threshold_warning}" @@ -57,21 +57,20 @@ resource "datadog_monitor" "appservices_memory_usage_count" { } notify_no_data = true # Will notify when no data is received - renotify_interval = "${var.memory_usage_renotify_interval}" + renotify_interval = 0 require_full_window = "${var.memory_usage_require_full_window}" timeout_h = "${var.memory_usage_timeout_h}" - include_tags = "${var.memory_usage_include_tags}" + include_tags = true tags = "${var.memory_usage_tags}" } # Monitoring App Services 404 errors rate resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.http_404_errors_count_rate_escalation_message}" + name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" + type = "query alert" + message = "${var.message}" query = < ${var.http_404_errors_count_rate_threshold_critical} EOF - evaluation_delay = "${var.http_404_errors_count_rate_appserv_eval_delay}" + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" thresholds { warning = "${var.http_404_errors_count_rate_threshold_warning}" critical = "${var.http_404_errors_count_rate_threshold_critical}" } - notify_no_data = false # Will NOT notify when no data is received - renotify_interval = "${var.http_404_errors_count_rate_renotify_interval}" + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = 0 require_full_window = true timeout_h = 
"${var.http_404_errors_count_rate_timeout_h}" - include_tags = "${var.http_404_errors_count_rate_include_tags}" + include_tags = true tags = "${var.http_404_errors_count_rate_tags}" } # Monitoring App Services HTTP 2xx status pages rate resource "datadog_monitor" "appservices_http_2xx_status_rate" { - name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" - type = "query alert" - message = "${var.message}" - escalation_message = "${var.http_2xx_status_rate_escalation_message}" + name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" + type = "query alert" + message = "${var.message}" - query = < Date: Thu, 23 Nov 2017 17:52:01 +0100 Subject: [PATCH 64/93] MON-74 Normalize monitors --- cloud/azure/app-services/README.md | 54 +-------- cloud/azure/app-services/inputs.tf | 113 +----------------- .../app-services/monitors-app_services.tf | 63 +++++----- 3 files changed, 40 insertions(+), 190 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index 90f5882..e56fac2 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -8,10 +8,8 @@ How to use this module module "datadog-monitors-azure-app-services" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/app-services?ref={revision}" - message = "${module.datadog-message-alerting.alerting-message}" - + message = "${module.datadog-message-alerting.alerting-message}" environment = "${var.environment}" - client_name = "${var.client_name}" } ``` @@ -28,65 +26,23 @@ Creates a DataDog monitors with the following checks : Inputs ------ -| Name | Description | Type | Default | Required | DESKTOP-0PBDRFR: ~ -|------|-------------|:----:|:-----:|:-----:| → -| client_name | Client Name | string | - | yes | +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| | 
delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| http_2xx_status_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_# -m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | http_2xx_status_rate_limit | | string | `30` | no | -| http_2xx_status_rate_require_full_window | A boolean indicating whether this monitor needs a full window of data bef -ore it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be s -kipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `true` -| no | -| http_2xx_status_rate_tags | A list of tags to associate with your monitor. This can help you categorize and filter m -onitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying - via the API | string | `` | no | | http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | | http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| http_2xx_status_rate_timeout_h | The number of hours of the monitor not reporting data before it will automatically -resolve from a triggered state. Defaults to false. | string | `false` | no | -| http_404_errors_count_rate_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... 
to write -last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | | http_404_errors_count_rate_limit | | string | `30` | no | -| http_404_errors_count_rate_require_full_window | A boolean indicating whether this monitor needs a full window of da -ta before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations wil -l be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | ` -true` | no | -| http_404_errors_count_rate_tags | A list of tags to associate with your monitor. This can help you categorize and fi -lter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when qu -erying via the API | string | `` | no | | http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | | http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | -| http_404_errors_count_rate_timeout_h | The number of hours of the monitor not reporting data before it will automati -cally resolve from a triggered state. Defaults to false. | string | `false` | no | -| memory_usage_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, - 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `5m` | no | -| memory_usage_require_full_window | A boolean indicating whether this monitor needs a full window of data before it's - evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. -Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| memory_usage_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors -in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the - API | string | `` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | -| memory_usage_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve -from a triggered state. Defaults to false. | string | `false` | no | | message | Message sent when a monitor is triggered | string | - | yes | -| response_time_last_time_window_code | Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5 -, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2) | string | `1h` | no | -| response_time_require_full_window | A boolean indicating whether this monitor needs a full window of data before it' -s evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. - Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise. | string | `false` | no | -| response_time_tags | A list of tags to associate with your monitor. This can help you categorize and filter monitors - in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via th -e API | string | `` | no | | response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | | response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | -| response_time_timeout_h | The number of hours of the monitor not reporting data before it will automatically resolve - from a triggered state. Defaults to false. 
| string | `false` | no | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation --------------------- diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 5f0f2b0..c4bc451 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -3,14 +3,14 @@ variable "environment" { type = "string" } -variable "client_name" { - description = "Client Name" - type = "string" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" } -variable "use_filter_tags" { - description = "Filter the data with service tags if true" - default = "true" +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" } variable "message" { @@ -36,31 +36,6 @@ variable "response_time_threshold_warning" { description = "Warning threshold in seconds" } -variable "response_time_last_time_window_code" { - default = "1h" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "response_time_require_full_window" { - default = false - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "response_time_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. 
Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "response_time_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "response_time_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -# } - ################################### ### MEMORY USAGE VARIABLES ### ################################### @@ -75,31 +50,6 @@ variable "memory_usage_threshold_warning" { description = "Warning threshold in MiB" } -variable "memory_usage_last_time_window_code" { - default = "5m" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "memory_usage_require_full_window" { - default = false - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "memory_usage_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "memory_usage_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "memory_usage_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." 
-# } - ################################# ### HTTP 404 status pages ### ################################# @@ -118,31 +68,6 @@ variable "http_404_errors_count_rate_threshold_warning" { description = "Warning threshold (number of requests)" } -variable "http_404_errors_count_rate_last_time_window_code" { - default = "5m" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "http_404_errors_count_rate_require_full_window" { - default = true - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "http_404_errors_count_rate_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "http_404_errors_count_rate_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "http_404_errors_count_rate_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -# } - ################################# ### HTTP 202 status pages ### ################################# @@ -160,29 +85,3 @@ variable "http_2xx_status_rate_threshold_warning" { default = 0.95 description = "Warning threshold (percentage)" } - -variable "http_2xx_status_rate_last_time_window_code" { - default = "5m" - description = "Query time window code, can be: 1h|4h|1d|2d|1w|1m|3m... 
to write last_#m (1, 5, 10, 15, or 30), last_#h (1, 2, or 4), or last_#d (1 or 2)" -} - -variable "http_2xx_status_rate_require_full_window" { - default = true - description = "A boolean indicating whether this monitor needs a full window of data before it's evaluated. We highly recommend you set this to False for sparse metrics, otherwise some evaluations will be skipped. Default: True for 'on average', 'at all times' and 'in total' aggregation. False otherwise." -} - -variable "http_2xx_status_rate_tags" { - default = [] - description = "A list of tags to associate with your monitor. This can help you categorize and filter monitors in the manage monitors page of the UI. Note: it's not currently possible to filter by these tags when querying via the API" -} - -variable "http_2xx_status_rate_timeout_h" { - default = false - description = "The number of hours of the monitor not reporting data before it will automatically resolve from a triggered state. Defaults to false." -} - -# variable "http_2xx_status_rate_notify_no_data" { -# default = true -# description = " boolean indicating whether this monitor will notify when data stops reporting. Defaults to true." -# } - diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 437b7fb..1cff1af 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -2,18 +2,18 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_appservices:enabled,env:%s", var.environment) : "*"}" + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { - name = "[${var.environment}] App Services response time {{value}}s is above ${var.response_time_threshold_critical}s" - type = "query alert" + name = "[${var.environment}] App Services response time > ${var.response_time_threshold_critical}s on {{name}}" + type = "metric alert" message = "${var.message}" query = <= ${var.response_time_threshold_critical} EOF @@ -26,24 +26,23 @@ resource "datadog_monitor" "appservices_response_time" { critical = "${var.response_time_threshold_critical}" } - notify_no_data = true # Will notify when no data is received + notify_no_data = true # Will notify when no data is received renotify_interval = 0 - require_full_window = "${var.response_time_require_full_window}" + require_full_window = true + timeout_h = 0 + include_tags = true - timeout_h = "${var.response_time_timeout_h}" - include_tags = true - - tags = "${var.response_time_tags}" + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } # Monitoring App Services memory usage resource "datadog_monitor" "appservices_memory_usage_count" { - name = "[${var.environment}] App Services memory usage {{value}} bytes is above ${ceil(var.memory_usage_threshold_critical/1000000)}MiB" - type = "query alert" + name = "[${var.environment}] App Services memory usage > ${ceil(var.memory_usage_threshold_critical/1000000)}MiB on {{name}}" + type = "metric alert" message = "${var.message}" query = <= ${var.memory_usage_threshold_critical} EOF @@ -58,22 +57,21 @@ resource "datadog_monitor" "appservices_memory_usage_count" { notify_no_data = true # Will notify when no data is received renotify_interval = 0 - require_full_window = "${var.memory_usage_require_full_window}" + require_full_window = true + timeout_h = 0 + include_tags = true - timeout_h 
= "${var.memory_usage_timeout_h}" - include_tags = true - - tags = "${var.memory_usage_tags}" + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } # Monitoring App Services 404 errors rate resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services {{value}} HTTP errors > ${var.http_404_errors_count_rate_limit} limit" - type = "query alert" + name = "[${var.environment}] App Services HTTP errors > ${var.http_404_errors_count_rate_limit} limit on {{name}}" + type = "metric alert" message = "${var.message}" query = < ${var.http_404_errors_count_rate_threshold_critical} EOF @@ -89,21 +87,20 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { notify_no_data = false # Will NOT notify when no data is received renotify_interval = 0 require_full_window = true + timeout_h = 0 + include_tags = true - timeout_h = "${var.http_404_errors_count_rate_timeout_h}" - include_tags = true - - tags = "${var.http_404_errors_count_rate_tags}" + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } # Monitoring App Services HTTP 2xx status pages rate resource "datadog_monitor" "appservices_http_2xx_status_rate" { - name = "[${var.environment}] App Services {{value}} Too much non 2xx HTTP status in response to the requests" - type = "query alert" + name = "[${var.environment}] App Services Too much non 2xx HTTP status in response to the requests on {{name}}" + type = "metric alert" message = "${var.message}" query = < Date: Thu, 23 Nov 2017 17:34:30 +0100 Subject: [PATCH 65/93] MON-75 Normalize monitors --- cloud/azure/sql-database/README.md | 20 ++++++----- cloud/azure/sql-database/inputs.tf | 26 ++++---------- .../monitors-sql-database-basics.tf | 34 ++++++++++++------- 3 files changed, 40 insertions(+), 40 deletions(-) diff --git a/cloud/azure/sql-database/README.md b/cloud/azure/sql-database/README.md index f135036..7d815e3 100644 --- 
a/cloud/azure/sql-database/README.md +++ b/cloud/azure/sql-database/README.md @@ -1,5 +1,5 @@ Azure SQL Database DataDog monitors -============================ +=================================== How to use this module ---------------------- @@ -8,10 +8,8 @@ How to use this module module "datadog-monitors-azure-storage" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/sql-database?ref={revision}" - message = "${module.datadog-message-alerting.alerting-message}" - + message = "${module.datadog-message-alerting.alerting-message}" environment = "${var.environment}" - client_name = "${var.client_name}" } ``` @@ -38,8 +36,14 @@ Inputs | dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no | | dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no | | environment | Architecture Environment | string | - | yes | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | message | Message sent when an alert is triggered | string | - | yes | -| provider | Cloud provider which the monitor and its based metric depend on | string | `azure` | no | -| service | Service monitored by this set of monitors | string | `sql-database` | no | -| subscription_id | Azure account id used as filter for monitors | string | - | yes | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_sql_database/](https://docs.datadoghq.com/integrations/azure_sql_database/) + +Azure SQL Database metrics documentation: [https://docs.microsoft.com/en-us/azure/sql-database/saas-dbpertenant-log-analytics](https://docs.microsoft.com/en-us/azure/sql-database/saas-dbpertenant-log-analytics) + diff --git 
a/cloud/azure/sql-database/inputs.tf b/cloud/azure/sql-database/inputs.tf index d3abe46..9ddab06 100644 --- a/cloud/azure/sql-database/inputs.tf +++ b/cloud/azure/sql-database/inputs.tf @@ -4,23 +4,6 @@ variable "environment" { type = "string" } -variable "subscription_id" { - description = "Azure account id used as filter for monitors" - type = "string" -} - -variable "provider" { - description = "Cloud provider which the monitor and its based metric depend on" - type = "string" - default = "azure" -} - -variable "service" { - description = "Service monitored by this set of monitors" - type = "string" - default = "sql-database" -} - # Global DataDog variable "delay" { description = "Delay in seconds for the metric evaluation" @@ -31,11 +14,16 @@ variable "message" { description = "Message sent when an alert is triggered" } -variable "use_filter_tags" { - description = "Filter the data with service tags if true" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" default = "true" } +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + # Azure SQL Database specific variable "cpu_threshold_warning" { diff --git a/cloud/azure/sql-database/monitors-sql-database-basics.tf b/cloud/azure/sql-database/monitors-sql-database-basics.tf index 5448b8a..1e75813 100644 --- a/cloud/azure/sql-database/monitors-sql-database-basics.tf +++ b/cloud/azure/sql-database/monitors-sql-database-basics.tf @@ -2,13 +2,13 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_sqldb:enabled,env:%s",var.environment) : "*"}" + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_sqldatabase:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } resource "datadog_monitor" "sql-database_cpu_90_15min" { - name = "[${var.environment}] SQL Database CPU high > 90% for 15 min on {{name}}" - message = "${message}" + name = "[${var.environment}] SQL Database CPU high > ${var.cpu_threshold_critical}% on {{name}}" + message = "${var.message}" query = < ${var.cpu_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { critical = "${var.cpu_threshold_critical}" @@ -32,13 +32,15 @@ resource "datadog_monitor" "sql-database_cpu_90_15min" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:sqldatabase", "team:azure", "provider:azure"] } resource "datadog_monitor" "sql-database_free_space_low" { - name = "[${var.environment}] SQL Database free space < 10 % on {{name}}" - message = "${message}" + name = "[${var.environment}] SQL Database free space < ${var.diskspace_threshold_critical}% on {{name}}" + message = "${var.message}" - type = "query alert" + type = "metric alert" query = < Date: Fri, 24 Nov 2017 16:53:25 +0100 Subject: [PATCH 66/93] MON-90 Azure API Management monitors --- cloud/azure/apimanagement/README.md | 43 +++++ cloud/azure/apimanagement/inputs.tf | 46 +++++ .../monitors-azure-apimanagement.tf | 160 ++++++++++++++++++ 3 files changed, 249 insertions(+) create mode 100644 cloud/azure/apimanagement/README.md create mode 100644 cloud/azure/apimanagement/inputs.tf create mode 100644 cloud/azure/apimanagement/monitors-azure-apimanagement.tf diff --git a/cloud/azure/apimanagement/README.md b/cloud/azure/apimanagement/README.md new file mode 100644 index 0000000..e59e81a --- /dev/null +++ b/cloud/azure/apimanagement/README.md @@ -0,0 +1,43 @@ +Azure API Management Datadog monitors +===================================== + +How to use this module +---------------------- +``` +module 
"datadog-monitors-azure-apimanagement" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/apimanagement?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates Datadog monitors with the following checks : + +* Service status +* Failed requests ratio +* Other requests ratio +* Unauthorized requests ratio +* Successful requests ratio + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| message | Message sent when a Redis monitor is triggered | string | - | yes | +| other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no | +| successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no | +| unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no | + +Related documentation +--------------------- + +Azure API Management metrics documentation: [https://docs.microsoft.com/en-us/azure/api-management/api-management-howto-use-azure-monitor](https://docs.microsoft.com/en-us/azure/api-management/api-management-howto-use-azure-monitor) diff --git a/cloud/azure/apimanagement/inputs.tf b/cloud/azure/apimanagement/inputs.tf new file mode 100644 index 0000000..002593e --- /dev/null +++ b/cloud/azure/apimanagement/inputs.tf @@ -0,0 +1,46 @@ +# Global Terraform +variable 
"environment" { + description = "Architecture environment" + type = "string" +} + +# Global DataDog +variable "message" { + description = "Message sent when a Redis monitor is triggered" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# Azure API Management specific +variable "failed_requests_threshold_critical" { + description = "Maximum acceptable percent of failed requests" + default = 5 +} + +variable "other_requests_threshold_critical" { + description = "Maximum acceptable percent of other requests" + default = 5 +} + +variable "unauthorized_requests_threshold_critical" { + description = "Maximum acceptable percent of unauthorized requests" + default = 5 +} + +variable "successful_requests_threshold_critical" { + description = "Minimum acceptable percent of successful requests" + default = 90 +} diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf new file mode 100644 index 0000000..f7a55b1 --- /dev/null +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -0,0 +1,160 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
+ format("dd_monitoring:enabled,dd_azure_apimanagement:enabled,env:%s", var.environment) : + "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "apimgt_status" { + name = "[${var.environment}] API Management status is not ok on {{name}}" + message = "${var.message}" + + query = < ${var.failed_requests_threshold_critical} + EOF + + thresholds { + critical = "${var.failed_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:apimanagement", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "apimgt_other_requests" { + name = "[${var.environment}] API Management {{name}} too much other requests" + message = "${var.message}" + + query = < ${var.other_requests_threshold_critical} + EOF + + thresholds { + critical = "${var.other_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:apimanagement", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "apimgt_unauthorized_requests" { + name = "[${var.environment}] API Management {{name}} too much unauthorized requests" + message = "${var.message}" + + query = < ${var.unauthorized_requests_threshold_critical} + EOF + + thresholds { + critical = "${var.unauthorized_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 
"${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:apimanagement", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "apimgt_successful_requests" { + name = "[${var.environment}] API Management {{name}} successful requests rate too low" + message = "${var.message}" + + query = < Date: Sun, 26 Nov 2017 20:17:14 +0100 Subject: [PATCH 67/93] MON-80 convert all as_count queries to sum --- cloud/azure/iothubs/monitors-iothubs.tf | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 6e1f926..9719e59 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -11,7 +11,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { message = "${var.message}" query = < Date: Sun, 26 Nov 2017 20:26:18 +0100 Subject: [PATCH 68/93] MON-74 convert all as_count queries to sum --- cloud/azure/app-services/monitors-app_services.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 1cff1af..2c2f80e 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -100,7 +100,7 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { message = "${var.message}" query = < Date: Sun, 26 Nov 2017 20:32:35 +0100 Subject: [PATCH 69/93] MON-78 convert all as_count queries to sum --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index f72af1f..3ad187f 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ 
b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -63,7 +63,7 @@ resource "datadog_monitor" "failed_function_requests" { message = "${var.message}" query = < ${var.failed_function_requests_threshold_critical} From 6b9c03947ad23b7e991c8b453582f56e93304db9 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Sun, 26 Nov 2017 20:37:22 +0100 Subject: [PATCH 70/93] MON-90 change no data to false because division --- cloud/azure/apimanagement/monitors-azure-apimanagement.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf index f7a55b1..82c3df7 100644 --- a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -145,7 +145,7 @@ resource "datadog_monitor" "apimgt_successful_requests" { } type = "metric alert" - notify_no_data = true + notify_no_data = false notify_audit = false timeout_h = 0 include_tags = true From d3bbb3ced5c4309f5b71622c79ff754d6bdd487a Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Fri, 24 Nov 2017 16:53:25 +0100 Subject: [PATCH 71/93] MON-90 Azure API Management monitors --- cloud/azure/apimanagement/README.md | 43 +++++ cloud/azure/apimanagement/inputs.tf | 46 +++++ .../monitors-azure-apimanagement.tf | 160 ++++++++++++++++++ 3 files changed, 249 insertions(+) create mode 100644 cloud/azure/apimanagement/README.md create mode 100644 cloud/azure/apimanagement/inputs.tf create mode 100644 cloud/azure/apimanagement/monitors-azure-apimanagement.tf diff --git a/cloud/azure/apimanagement/README.md b/cloud/azure/apimanagement/README.md new file mode 100644 index 0000000..e59e81a --- /dev/null +++ b/cloud/azure/apimanagement/README.md @@ -0,0 +1,43 @@ +Azure API Management Datadog monitors +===================================== + +How to use this module +---------------------- +``` +module "datadog-monitors-azure-apimanagement" { + 
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/apimanagement?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates Datadog monitors with the following checks : + +* Service status +* Failed requests ratio +* Other requests ratio +* Unauthorized requests ratio +* Successful requests ratio + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| message | Message sent when a Redis monitor is triggered | string | - | yes | +| other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no | +| successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no | +| unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no | + +Related documentation +--------------------- + +Azure API Management metrics documentation: [https://docs.microsoft.com/en-us/azure/api-management/api-management-howto-use-azure-monitor](https://docs.microsoft.com/en-us/azure/api-management/api-management-howto-use-azure-monitor) diff --git a/cloud/azure/apimanagement/inputs.tf b/cloud/azure/apimanagement/inputs.tf new file mode 100644 index 0000000..002593e --- /dev/null +++ b/cloud/azure/apimanagement/inputs.tf @@ -0,0 +1,46 @@ +# Global Terraform +variable "environment" { + description = "Architecture 
environment" + type = "string" +} + +# Global DataDog +variable "message" { + description = "Message sent when a Redis monitor is triggered" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# Azure API Management specific +variable "failed_requests_threshold_critical" { + description = "Maximum acceptable percent of failed requests" + default = 5 +} + +variable "other_requests_threshold_critical" { + description = "Maximum acceptable percent of other requests" + default = 5 +} + +variable "unauthorized_requests_threshold_critical" { + description = "Maximum acceptable percent of unauthorized requests" + default = 5 +} + +variable "successful_requests_threshold_critical" { + description = "Minimum acceptable percent of successful requests" + default = 90 +} diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf new file mode 100644 index 0000000..f7a55b1 --- /dev/null +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -0,0 +1,160 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
+ format("dd_monitoring:enabled,dd_azure_apimanagement:enabled,env:%s", var.environment) : + "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "apimgt_status" { + name = "[${var.environment}] API Management status is not ok on {{name}}" + message = "${var.message}" + + query = < ${var.failed_requests_threshold_critical} + EOF + + thresholds { + critical = "${var.failed_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:apimanagement", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "apimgt_other_requests" { + name = "[${var.environment}] API Management {{name}} too much other requests" + message = "${var.message}" + + query = < ${var.other_requests_threshold_critical} + EOF + + thresholds { + critical = "${var.other_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:apimanagement", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "apimgt_unauthorized_requests" { + name = "[${var.environment}] API Management {{name}} too much unauthorized requests" + message = "${var.message}" + + query = < ${var.unauthorized_requests_threshold_critical} + EOF + + thresholds { + critical = "${var.unauthorized_requests_threshold_critical}" + } + + type = "metric alert" + notify_no_data = false + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = 
"${var.delay}" + evaluation_delay = "${var.delay}" + renotify_interval = 0 + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:apimanagement", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "apimgt_successful_requests" { + name = "[${var.environment}] API Management {{name}} successful requests rate too low" + message = "${var.message}" + + query = < Date: Mon, 27 Nov 2017 23:00:12 +0100 Subject: [PATCH 72/93] MON-74 add group by to all queries --- cloud/azure/app-services/monitors-app_services.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 2c2f80e..6bf3fd6 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -14,7 +14,7 @@ resource "datadog_monitor" "appservices_response_time" { query = <= ${var.response_time_threshold_critical} EOF @@ -43,7 +43,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { query = <= ${var.memory_usage_threshold_critical} EOF @@ -72,7 +72,7 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { query = < ${var.http_404_errors_count_rate_threshold_critical} EOF @@ -102,7 +102,7 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { query = < Date: Mon, 27 Nov 2017 23:02:24 +0100 Subject: [PATCH 73/93] MON-75 add region to all group by --- cloud/azure/sql-database/monitors-sql-database-basics.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud/azure/sql-database/monitors-sql-database-basics.tf b/cloud/azure/sql-database/monitors-sql-database-basics.tf index 1e75813..337b28f 100644 --- a/cloud/azure/sql-database/monitors-sql-database-basics.tf +++ b/cloud/azure/sql-database/monitors-sql-database-basics.tf @@ -12,7 +12,7 @@ resource "datadog_monitor" "sql-database_cpu_90_15min" { query = < ${var.cpu_threshold_critical} EOF @@ -44,7 
+44,7 @@ resource "datadog_monitor" "sql-database_free_space_low" { query = < ${var.diskspace_threshold_critical} EOF @@ -75,7 +75,7 @@ resource "datadog_monitor" "sql-database_dtu_consumption_high" { query = < ${var.dtu_threshold_critical} EOF @@ -106,7 +106,7 @@ resource "datadog_monitor" "sql-database_deadlocks_count" { query = < ${var.deadlock_threshold_critical} EOF From 855e52a36fb1288dae41a356a032a56955dded62 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Mon, 27 Nov 2017 23:03:14 +0100 Subject: [PATCH 74/93] MON-76 add region to all group by --- cloud/azure/redis/monitors-azure-redis.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud/azure/redis/monitors-azure-redis.tf b/cloud/azure/redis/monitors-azure-redis.tf index 57b3a6c..8e68558 100644 --- a/cloud/azure/redis/monitors-azure-redis.tf +++ b/cloud/azure/redis/monitors-azure-redis.tf @@ -11,7 +11,7 @@ resource "datadog_monitor" "status" { message = "${var.message}" query = < ${var.evictedkeys_limit_threshold_critical} EOF @@ -67,7 +67,7 @@ resource "datadog_monitor" "percent_processor_time" { query = < ${var.percent_processor_time_threshold_critical} EOF @@ -98,7 +98,7 @@ resource "datadog_monitor" "server_load" { query = < ${var.server_load_rate_threshold_critical} EOF From 3934e869a1016bbe2aa1637e8bebaea359794d9b Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Mon, 27 Nov 2017 23:19:33 +0100 Subject: [PATCH 75/93] MON-77 improve queries adding as_count --- cloud/azure/eventhub/monitors-eventhub.tf | 34 +++++++++++------------ 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index ff52507..4627106 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -11,7 +11,7 @@ resource "datadog_monitor" "eventhub_status" { message = "${var.message}" query = < ${var.failed_requests_rate_thresold_critical} + sum(last_5m): ( + 
avg:azure.eventhub_namespaces.failed_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() / + ( + avg:azure.eventhub_namespaces.successful_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + + avg:azure.eventhub_namespaces.failed_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + ) * 100 > ${var.failed_requests_rate_thresold_critical} EOF type = "metric alert" @@ -67,16 +67,16 @@ resource "datadog_monitor" "eventhub_errors" { message = "${var.message}" query = < ${var.errors_rate_thresold_critical} + sum(last_5m): ( + avg:azure.eventhub_namespaces.internal_server_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + + avg:azure.eventhub_namespaces.server_busy_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + + avg:azure.eventhub_namespaces.other_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + ) / ( + avg:azure.eventhub_namespaces.successful_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + + avg:azure.eventhub_namespaces.internal_server_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + + avg:azure.eventhub_namespaces.server_busy_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + + avg:azure.eventhub_namespaces.other_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + ) * 100 > ${var.errors_rate_thresold_critical} EOF type = "metric alert" From 00e1ada46ed9af49a0d6c7360e1e4f5e6406fc24 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Mon, 27 Nov 2017 23:29:06 +0100 Subject: [PATCH 76/93] MON-74 fix percent query --- cloud/azure/app-services/monitors-app_services.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 6bf3fd6..aedc748 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -101,8 +101,8 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { query = < Date: Mon, 27 Nov 2017 23:31:10 +0100 Subject: [PATCH 77/93] MON-78 reorder groupy from less to must specific --- .../stream-analytics/monitors-stream-analytics.tf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 3ad187f..fe4e983 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -11,7 +11,7 @@ resource "datadog_monitor" "status" { message = "${var.message}" query = < ${var.su_utilization_threshold_critical} EOF type = "metric alert" @@ -64,8 +64,8 @@ resource "datadog_monitor" "failed_function_requests" { query = < ${var.failed_function_requests_threshold_critical} EOF type = "metric alert" @@ -94,7 +94,7 @@ resource "datadog_monitor" "conversion_errors" { query = < ${var.conversion_errors_threshold_critical} EOF type = "metric alert" @@ -123,7 +123,7 @@ resource "datadog_monitor" "runtime_errors" { query = < ${var.runtime_errors_threshold_critical} EOF type = "metric alert" From b30a0e2689716abcfb0de787cb0a8080f929c229 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Mon, 27 Nov 2017 23:32:25 +0100 Subject: [PATCH 78/93] MON-80 add region to group by --- cloud/azure/iothubs/monitors-iothubs.tf | 52 ++++++++++++------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 9719e59..9388f1c 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ 
-12,9 +12,9 @@ resource "datadog_monitor" "too_many_jobs_failed" { query = < ${var.failed_jobs_rate_threshold_critical} EOF @@ -110,7 +110,7 @@ resource "datadog_monitor" "status" { message = "${var.message}" query = < ${var.failed_c2d_methods_rate_threshold_critical} EOF @@ -192,9 +192,9 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { query = < ${var.failed_c2d_twin_read_rate_threshold_critical} EOF @@ -225,9 +225,9 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { query = < ${var.failed_c2d_twin_update_rate_threshold_critical} EOF @@ -258,9 +258,9 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { query = < ${var.failed_d2c_twin_read_rate_threshold_critical} EOF @@ -291,9 +291,9 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { query = < ${var.failed_d2c_twin_update_rate_threshold_critical} EOF @@ -324,7 +324,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { query = < ${var.dropped_d2c_telemetry_egress_threshold_critical} EOF @@ -355,7 +355,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { query = < ${var.orphaned_d2c_telemetry_egress_threshold_critical} EOF @@ -386,7 +386,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { query = < ${var.invalid_d2c_telemetry_egress_threshold_critical} EOF @@ -417,7 +417,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { query = < ${var.fallback_d2c_telemetry_egress_threshold_critical} EOF @@ -448,8 +448,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { query = < 0 EOF From 835942e6e1942620826c5120eb3263a23b7038cd Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Mon, 27 Nov 2017 23:33:15 +0100 Subject: [PATCH 79/93] MON-78 add region to group by --- .../stream-analytics/monitors-stream-analytics.tf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf 
b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index fe4e983..3b1324a 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -11,7 +11,7 @@ resource "datadog_monitor" "status" { message = "${var.message}" query = < ${var.su_utilization_threshold_critical} EOF type = "metric alert" @@ -64,8 +64,8 @@ resource "datadog_monitor" "failed_function_requests" { query = < ${var.failed_function_requests_threshold_critical} EOF type = "metric alert" @@ -94,7 +94,7 @@ resource "datadog_monitor" "conversion_errors" { query = < ${var.conversion_errors_threshold_critical} EOF type = "metric alert" @@ -123,7 +123,7 @@ resource "datadog_monitor" "runtime_errors" { query = < ${var.runtime_errors_threshold_critical} EOF type = "metric alert" From 4d42b8832513824f8971d8d809807917ae79442d Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Mon, 27 Nov 2017 23:39:54 +0100 Subject: [PATCH 80/93] MON-90 fix queries syntax with as_count --- .../monitors-azure-apimanagement.tf | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf index 82c3df7..c427d21 100644 --- a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -13,7 +13,7 @@ resource "datadog_monitor" "apimgt_status" { message = "${var.message}" query = < ${var.failed_requests_threshold_critical} EOF @@ -72,9 +71,8 @@ resource "datadog_monitor" "apimgt_other_requests" { query = < ${var.other_requests_threshold_critical} EOF @@ -103,9 +101,8 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" { query = < ${var.unauthorized_requests_threshold_critical} EOF @@ -134,9 +131,8 @@ resource "datadog_monitor" "apimgt_successful_requests" { query = < Date: Thu, 7 Dec 2017 10:40:04 +0100 Subject: [PATCH 81/93] MON-74 
percent for requests --- cloud/azure/app-services/README.md | 15 ++++------ cloud/azure/app-services/inputs.tf | 30 +++++++------------ .../app-services/monitors-app_services.tf | 23 +++++++------- 3 files changed, 29 insertions(+), 39 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index e56fac2..ab49366 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -19,9 +19,8 @@ Creates a DataDog monitors with the following checks : * Response time * Memory usage count -* HTTP 404 errors -* HTTP 50x errors -* HTTP 20x rate +* HTTP 404 requests +* HTTP 2xx requests Inputs ------ @@ -32,12 +31,10 @@ Inputs | environment | Architecture environment | string | - | yes | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | -| http_2xx_status_rate_limit | | string | `30` | no | -| http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | -| http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| http_404_errors_count_rate_limit | | string | `30` | no | -| http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | -| http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | +| http_2xx_requests_threshold_critical | Minimum critical acceptable percent of 2xx requests | string | `90` | no | +| http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | +| http_404_requests_threshold_critical | Maximum critical acceptable percent of 404 errors | string | `40` | no | +| http_404_requests_threshold_warning | Maximum warning acceptable percent of 404 errors | string | `30` | no | | memory_usage_threshold_critical | 
Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | | message | Message sent when a monitor is triggered | string | - | yes | diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index c4bc451..541a0e7 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -54,34 +54,26 @@ variable "memory_usage_threshold_warning" { ### HTTP 404 status pages ### ################################# -variable "http_404_errors_count_rate_limit" { - default = 30 +variable "http_404_requests_threshold_critical" { + default = 40 + description = "Maximum critical acceptable percent of 404 errors" } -variable "http_404_errors_count_rate_threshold_critical" { +variable "http_404_requests_threshold_warning" { default = 30 - description = "Alerting threshold (number of requests)" -} - -variable "http_404_errors_count_rate_threshold_warning" { - default = 10 - description = "Warning threshold (number of requests)" + description = "Maximum critical acceptable percent of 404 errors" } ################################# ### HTTP 202 status pages ### ################################# -variable "http_2xx_status_rate_limit" { - default = 30 +variable "http_2xx_requests_threshold_critical" { + default = 90 + description = "Minimum critical acceptable percent of 2xx requests" } -variable "http_2xx_status_rate_threshold_critical" { - default = 0.9 - description = "Alerting threshold (percentage)" -} - -variable "http_2xx_status_rate_threshold_warning" { - default = 0.95 - description = "Warning threshold (percentage)" +variable "http_2xx_requests_threshold_warning" { + default = 95 + description = "Minimum warning acceptable percent of 2xx requests" } diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index aedc748..0abc8fd 100644 --- 
a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -15,7 +15,7 @@ resource "datadog_monitor" "appservices_response_time" { query = <= ${var.response_time_threshold_critical} + ) > ${var.response_time_threshold_critical} EOF evaluation_delay = "${var.delay}" @@ -44,7 +44,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { query = <= ${var.memory_usage_threshold_critical} + ) > ${var.memory_usage_threshold_critical} EOF evaluation_delay = "${var.delay}" @@ -71,17 +71,18 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { message = "${var.message}" query = < ${var.http_404_errors_count_rate_threshold_critical} + sum(last_5m): ( + avg:azure.app_services.http404{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() / + avg:azure.app_services.requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + ) * 100 > ${var.http_404_requests_threshold_critical} EOF evaluation_delay = "${var.delay}" new_host_delay = "${var.delay}" thresholds { - warning = "${var.http_404_errors_count_rate_threshold_warning}" - critical = "${var.http_404_errors_count_rate_threshold_critical}" + warning = "${var.http_404_requests_threshold_warning}" + critical = "${var.http_404_requests_threshold_critical}" } notify_no_data = false # Will NOT notify when no data is received @@ -102,16 +103,16 @@ resource "datadog_monitor" "appservices_http_2xx_status_rate" { query = < Date: Thu, 7 Dec 2017 10:43:23 +0100 Subject: [PATCH 82/93] MON-77 missing parenthesis --- cloud/azure/eventhub/monitors-eventhub.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index 4627106..bfb5775 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -35,7 +35,7 @@ resource "datadog_monitor" "eventhub_failed_requests" { 
query = < Date: Thu, 7 Dec 2017 10:59:22 +0100 Subject: [PATCH 83/93] MON-90 fix failed to unauthorized requests --- cloud/azure/apimanagement/monitors-azure-apimanagement.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf index c427d21..a1a0457 100644 --- a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -101,7 +101,7 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" { query = < ${var.unauthorized_requests_threshold_critical} EOF From 886ae437f4ceffe7b8fbfd738724bc1638c2f0df Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Mon, 11 Dec 2017 11:41:32 +0100 Subject: [PATCH 84/93] MON-74 Fix non existent variable --- cloud/azure/app-services/monitors-app_services.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 0abc8fd..669e9db 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -66,7 +66,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" { # Monitoring App Services 404 errors rate resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services HTTP errors > ${var.http_404_errors_count_rate_limit} limit on {{name}}" + name = "[${var.environment}] App Services HTTP errors > {{value}}% limit on {{name}}" type = "metric alert" message = "${var.message}" From 7de2bf4aca61736a28d143260e4d91799bca2ec6 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 13:13:09 +0100 Subject: [PATCH 85/93] MON-74 decrease thresholds for 404 errors --- cloud/azure/app-services/inputs.tf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 541a0e7..96c2892 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -55,17 +55,17 @@ variable "memory_usage_threshold_warning" { ################################# variable "http_404_requests_threshold_critical" { - default = 40 - description = "Maximum critical acceptable percent of 404 errors" -} - -variable "http_404_requests_threshold_warning" { default = 30 description = "Maximum critical acceptable percent of 404 errors" } +variable "http_404_requests_threshold_warning" { + default = 15 + description = "Maximum warning acceptable percent of 404 errors" +} + ################################# -### HTTP 202 status pages ### +### HTTP 2xx status pages ### ################################# variable "http_2xx_requests_threshold_critical" { From 6cb41b8fbb08f424f59ab13187028809d2b2a984 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 13:14:30 +0100 Subject: [PATCH 86/93] MON-74 fix response time monitor name --- cloud/azure/app-services/monitors-app_services.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 669e9db..d473fd5 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -8,7 +8,7 @@ data "template_file" "filter" { # Monitoring App Services response time resource "datadog_monitor" "appservices_response_time" { - name = "[${var.environment}] App Services response time > ${var.response_time_threshold_critical}s on {{name}}" + name = "[${var.environment}] App Services response time of {{value}}s is to high on {{name}}" type = "metric alert" message = "${var.message}" From 3a56b974c106da967214dc387020623a5e427da5 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 13:28:11 +0100 Subject: [PATCH 87/93] MON-74 Add 5xx 
errors monitor --- cloud/azure/app-services/README.md | 3 ++ cloud/azure/app-services/inputs.tf | 14 +++++++ .../app-services/monitors-app_services.tf | 38 +++++++++++++++++-- 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index ab49366..b439492 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -19,6 +19,7 @@ Creates a DataDog monitors with the following checks : * Response time * Memory usage count +* HTTP 5xx requests * HTTP 404 requests * HTTP 2xx requests @@ -35,6 +36,8 @@ Inputs | http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | | http_404_requests_threshold_critical | Maximum critical acceptable percent of 404 errors | string | `40` | no | | http_404_requests_threshold_warning | Maximum warning acceptable percent of 404 errors | string | `30` | no | +| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `40` | no | +| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `30` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | | message | Message sent when a monitor is triggered | string | - | yes | diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index 96c2892..bc50156 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -50,6 +50,20 @@ variable "memory_usage_threshold_warning" { description = "Warning threshold in MiB" } +################################# +### HTTP 5xx status pages ### +################################# + +variable "http_5xx_requests_threshold_critical" { + default = 20 + description = "Maximum critical acceptable percent of 5xx errors" +} + +variable 
"http_5xx_requests_threshold_warning" { + default = 10 + description = "Maximum warning acceptable percent of 5xx errors" +} + ################################# ### HTTP 404 status pages ### ################################# diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index d473fd5..3f8b49b 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -64,9 +64,39 @@ resource "datadog_monitor" "appservices_memory_usage_count" { tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } -# Monitoring App Services 404 errors rate +# Monitoring App Services 5xx errors percent +resource "datadog_monitor" "appservices_http_5xx_errors_count" { + name = "[${var.environment}] App Services HTTP 5xx errors is {{value}}% above the limit on {{name}}" + type = "metric alert" + message = "${var.message}" + + query = < ${var.http_5xx_requests_threshold_critical} + EOF + + evaluation_delay = "${var.delay}" + new_host_delay = "${var.delay}" + + thresholds { + warning = "${var.http_5xx_requests_threshold_warning}" + critical = "${var.http_5xx_requests_threshold_critical}" + } + + notify_no_data = false # Will NOT notify when no data is received + renotify_interval = 0 + require_full_window = true + timeout_h = 0 + include_tags = true + + tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] +} + +# Monitoring App Services 404 errors percent resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services HTTP errors > {{value}}% limit on {{name}}" + name = "[${var.environment}] App Services HTTP 404 errors is {{value}}% above the limit on {{name}}" type = "metric alert" message = "${var.message}" @@ -94,9 +124,9 @@ resource "datadog_monitor" "appservices_http_404_errors_count" { tags = ["env:${var.environment}", "resource:appservices", 
"team:azure", "provider:azure"] } -# Monitoring App Services HTTP 2xx status pages rate +# Monitoring App Services HTTP 2xx status pages percent resource "datadog_monitor" "appservices_http_2xx_status_rate" { - name = "[${var.environment}] App Services Too much non 2xx HTTP status in response to the requests on {{name}}" + name = "[${var.environment}] App Services HTTP 2xx responses is {{value}}% below the limit on {{name}}" type = "metric alert" message = "${var.message}" From e3e3469cfbac1f36c0e8abfe5a8447145447bfbd Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 13:29:25 +0100 Subject: [PATCH 88/93] MON-74 Change 404 errors to 4xx --- cloud/azure/app-services/README.md | 6 +++--- cloud/azure/app-services/inputs.tf | 10 +++++----- cloud/azure/app-services/monitors-app_services.tf | 14 +++++++------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index b439492..fac9581 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -20,7 +20,7 @@ Creates a DataDog monitors with the following checks : * Response time * Memory usage count * HTTP 5xx requests -* HTTP 404 requests +* HTTP 4xx requests * HTTP 2xx requests Inputs @@ -34,8 +34,8 @@ Inputs | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | http_2xx_requests_threshold_critical | Minimum critical acceptable percent of 2xx requests | string | `90` | no | | http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | -| http_404_requests_threshold_critical | Maximum critical acceptable percent of 404 errors | string | `40` | no | -| http_404_requests_threshold_warning | Maximum warning acceptable percent of 404 errors | string | `30` | no | +| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `40` | no | +| 
http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `30` | no | | http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `40` | no | | http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `30` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | diff --git a/cloud/azure/app-services/inputs.tf b/cloud/azure/app-services/inputs.tf index bc50156..3085251 100644 --- a/cloud/azure/app-services/inputs.tf +++ b/cloud/azure/app-services/inputs.tf @@ -65,17 +65,17 @@ variable "http_5xx_requests_threshold_warning" { } ################################# -### HTTP 404 status pages ### +### HTTP 4xx status pages ### ################################# -variable "http_404_requests_threshold_critical" { +variable "http_4xx_requests_threshold_critical" { default = 30 - description = "Maximum critical acceptable percent of 404 errors" + description = "Maximum critical acceptable percent of 4xx errors" } -variable "http_404_requests_threshold_warning" { +variable "http_4xx_requests_threshold_warning" { default = 15 - description = "Maximum warning acceptable percent of 404 errors" + description = "Maximum warning acceptable percent of 4xx errors" } ################################# diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 3f8b49b..02cf2d9 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -94,25 +94,25 @@ resource "datadog_monitor" "appservices_http_5xx_errors_count" { tags = ["env:${var.environment}", "resource:appservices", "team:azure", "provider:azure"] } -# Monitoring App Services 404 errors percent -resource "datadog_monitor" "appservices_http_404_errors_count" { - name = "[${var.environment}] App Services HTTP 404 errors is {{value}}% above the limit on 
{{name}}" +# Monitoring App Services 4xx errors percent +resource "datadog_monitor" "appservices_http_4xx_errors_count" { + name = "[${var.environment}] App Services HTTP 4xx errors is {{value}}% above the limit on {{name}}" type = "metric alert" message = "${var.message}" query = < ${var.http_404_requests_threshold_critical} + ) * 100 > ${var.http_4xx_requests_threshold_critical} EOF evaluation_delay = "${var.delay}" new_host_delay = "${var.delay}" thresholds { - warning = "${var.http_404_requests_threshold_warning}" - critical = "${var.http_404_requests_threshold_critical}" + warning = "${var.http_4xx_requests_threshold_warning}" + critical = "${var.http_4xx_requests_threshold_critical}" } notify_no_data = false # Will NOT notify when no data is received From b2d807fa46495572e8981ae749fa6ca7ad854826 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 15:51:55 +0100 Subject: [PATCH 89/93] MON-74 update readme with new thresholds --- cloud/azure/app-services/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud/azure/app-services/README.md b/cloud/azure/app-services/README.md index fac9581..dc9e526 100644 --- a/cloud/azure/app-services/README.md +++ b/cloud/azure/app-services/README.md @@ -34,10 +34,10 @@ Inputs | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | http_2xx_requests_threshold_critical | Minimum critical acceptable percent of 2xx requests | string | `90` | no | | http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | -| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `40` | no | -| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `30` | no | -| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `40` | no | -| http_5xx_requests_threshold_warning | Maximum 
warning acceptable percent of 5xx errors | string | `30` | no | +| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no | +| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no | +| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no | +| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no | | memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | | message | Message sent when a monitor is triggered | string | - | yes | From 66747bda71b12563f8ffcc026a0dbc1bcaed4844 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 15 Dec 2017 15:59:56 +0100 Subject: [PATCH 90/93] MON-90 change avg to sum for all as_count queries --- cloud/azure/apimanagement/monitors-azure-apimanagement.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf index a1a0457..2a23126 100644 --- a/cloud/azure/apimanagement/monitors-azure-apimanagement.tf +++ b/cloud/azure/apimanagement/monitors-azure-apimanagement.tf @@ -40,7 +40,7 @@ resource "datadog_monitor" "apimgt_failed_requests" { message = "${var.message}" query = < ${var.failed_requests_threshold_critical} @@ -70,7 +70,7 @@ resource "datadog_monitor" "apimgt_other_requests" { message = "${var.message}" query = < ${var.other_requests_threshold_critical} @@ -100,7 +100,7 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" { message = "${var.message}" query = < ${var.unauthorized_requests_threshold_critical} @@ -130,7 +130,7 @@ resource "datadog_monitor" "apimgt_successful_requests" { message = "${var.message}" query = < Date: Mon, 27 Nov 2017 
17:13:45 +0100 Subject: [PATCH 91/93] MON-73 Azure managed services monitors base feature --- cloud/azure/README.md | 118 +++++++ cloud/azure/inputs.tf | 395 ++++++++++++++++++++++++ cloud/azure/iothubs/README.md | 3 +- cloud/azure/iothubs/inputs.tf | 11 +- cloud/azure/iothubs/monitors-iothubs.tf | 72 ++--- cloud/azure/monitors.tf | 166 ++++++++++ 6 files changed, 715 insertions(+), 50 deletions(-) create mode 100644 cloud/azure/README.md create mode 100644 cloud/azure/inputs.tf create mode 100644 cloud/azure/monitors.tf diff --git a/cloud/azure/README.md b/cloud/azure/README.md new file mode 100644 index 0000000..5dcff20 --- /dev/null +++ b/cloud/azure/README.md @@ -0,0 +1,118 @@ +Azure monitors +============== + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates a set of Azure DataDog monitors for the following components : + +* Azure App Services monitors +* Azure SQL monitors +* Azure Redis monitors +* Azure Event Hub monitors +* Azure Stream Analytics monitors +* Azure Storage monitors +* Azure IOT Hub monitors +* Azure API Management monitors + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| apimanagement_failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no | +| apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no | +| apimanagement_successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no | +| apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no | +| 
appservices_http_2xx_status_rate_limit | | string | `30` | no | +| appservices_http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | +| appservices_http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | +| appservices_http_404_errors_count_rate_limit | | string | `30` | no | +| appservices_http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | +| appservices_http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | +| appservices_memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | +| appservices_memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | +| appservices_response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | +| appservices_response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| eventhub_errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no | +| eventhub_errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no | +| eventhub_failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no | +| eventhub_failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| iothub_dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | 
string | `1000` | no | +| iothub_dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no | +| iothub_failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | +| iothub_failed_queryjobs_rate_threshold_critical | 
QueryJobs Failed rate limit (critical threshold) | string | `10` | no | +| iothub_failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| iothub_fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | +| iothub_fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | +| iothub_invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | +| iothub_invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | +| iothub_orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no | +| iothub_orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| non_taggable_filter_tags | Tags used for filtering for components without tag support | string | `*` | no | +| redis_evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no | +| redis_evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no | +| redis_percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no | +| redis_percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no | +| redis_server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | +| redis_server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no | +| sqldatabase_cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | +| 
sqldatabase_cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `` | no | +| sqldatabase_deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no | +| sqldatabase_diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no | +| sqldatabase_diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no | +| sqldatabase_dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no | +| sqldatabase_dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no | +| storage_authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `15` | no | +| storage_availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `90` | no | +| storage_client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `15` | no | +| storage_latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `1000` | no | +| storage_network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `5` | no | +| storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `10` | no | +| storage_successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `90` | no | +| storage_throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `10` | no | +| storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `5` | no | +| streamanalytics_conversion_errors_threshold_critical | 
Conversion errors limit (critical threshold) | string | `10` | no | +| streamanalytics_conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no | +| streamanalytics_failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | +| streamanalytics_function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | +| streamanalytics_runtime_errors_threshold_critical | Runtime errors limit (critical threshold) | string | `10` | no | +| streamanalytics_runtime_errors_threshold_warning | Runtime errors limit (warning threshold) | string | `0` | no | +| streamanalytics_su_utilization_threshold_critical | Streaming Unit utilization rate limit (critical threshold) | string | `80` | no | +| streamanalytics_su_utilization_threshold_warning | Streaming Unit utilization rate limit (warning threshold) | string | `60` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/) + +Azure metrics documentation: [https://docs.microsoft.com/en-us/azure/monitoring-and-diagnostics/monitoring-overview-metrics](https://docs.microsoft.com/en-us/azure/monitoring-and-diagnostics/monitoring-overview-metrics) diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf new file mode 100644 index 0000000..5526988 --- /dev/null +++ b/cloud/azure/inputs.tf @@ -0,0 +1,395 @@ +variable "environment" { + description = "Architecture environment" + type = "string" +} + +variable "message" { + description = "Message sent when a monitor is triggered" + type = "string" +} + +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = 
"Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "non_taggable_filter_tags" { + description = "Tags used for filtering for components without tag support" + default = "*" +} + +# Azure API Management specific variables +variable "apimanagement_failed_requests_threshold_critical" { + description = "Maximum acceptable percent of failed requests" + default = 5 +} + +variable "apimanagement_other_requests_threshold_critical" { + description = "Maximum acceptable percent of other requests" + default = 5 +} + +variable "apimanagement_unauthorized_requests_threshold_critical" { + description = "Maximum acceptable percent of unauthorized requests" + default = 5 +} + +variable "apimanagement_successful_requests_threshold_critical" { + description = "Minimum acceptable percent of successful requests" + default = 90 +} + +# Azure App Services specific variables +variable "appservices_response_time_threshold_critical" { + default = 0.8 + description = "Alerting threshold in seconds" +} + +variable "appservices_response_time_threshold_warning" { + default = 0.4 + description = "Warning threshold in seconds" +} + +variable "appservices_memory_usage_threshold_critical" { + default = 52430000 + description = "Alerting threshold in Mib" +} + +variable "appservices_memory_usage_threshold_warning" { + default = 33550000 + description = "Warning threshold in MiB" +} + +variable "appservices_http_404_errors_count_rate_limit" { + default = 30 +} + +variable "appservices_http_404_errors_count_rate_threshold_critical" { + default = 30 + description = "Alerting threshold (number of requests)" +} + +variable "appservices_http_404_errors_count_rate_threshold_warning" { + default = 10 + description = "Warning threshold (number of requests)" +} + +variable "appservices_http_2xx_status_rate_limit" { + default = 30 +} + +variable "appservices_http_2xx_status_rate_threshold_critical" { + default = 0.9 + description = "Alerting threshold 
(percentage)" +} + +variable "appservices_http_2xx_status_rate_threshold_warning" { + default = 0.95 + description = "Warning threshold (percentage)" +} + +# Azure Event Hub specific variables +variable "eventhub_failed_requests_rate_thresold_critical" { + description = "Failed requests ratio (percentage) to trigger the critical alert" + default = 3 +} + +variable "eventhub_failed_requests_rate_thresold_warning" { + description = "Failed requests ratio (percentage) to trigger a warning alert" + default = 1 +} + +variable "eventhub_errors_rate_thresold_critical" { + description = "Errors ratio (percentage) to trigger the critical alert" + default = 3 +} + +variable "eventhub_errors_rate_thresold_warning" { + description = "Errors ratio (percentage) to trigger a warning alert" + default = 1 +} + +# IOT Hub specific variables +variable "iothub_failed_jobs_rate_threshold_warning" { + description = "Jobs Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_jobs_rate_threshold_critical" { + description = "Jobs Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_listjobs_rate_threshold_warning" { + description = "ListJobs Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_listjobs_rate_threshold_critical" { + description = "ListJobs Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_queryjobs_rate_threshold_warning" { + description = "QueryJobs Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_queryjobs_rate_threshold_critical" { + description = "QueryJobs Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_c2d_methods_rate_threshold_warning" { + description = "C2D Methods Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_c2d_methods_rate_threshold_critical" { + description = "C2D Methods Failed rate limit (critical threshold)" + default = 10 +} + 
+variable "iothub_failed_c2d_twin_read_rate_threshold_warning" { + description = "C2D Twin Read Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_c2d_twin_read_rate_threshold_critical" { + description = "C2D Twin Read Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_c2d_twin_update_rate_threshold_warning" { + description = "C2D Twin Update Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_c2d_twin_update_rate_threshold_critical" { + description = "C2D Twin Update Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_d2c_twin_read_rate_threshold_warning" { + description = "D2C Twin Read Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_d2c_twin_read_rate_threshold_critical" { + description = "D2C Twin Read Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_failed_d2c_twin_update_rate_threshold_warning" { + description = "D2C Twin Update Failed rate limit (warning threshold)" + default = 0 +} + +variable "iothub_failed_d2c_twin_update_rate_threshold_critical" { + description = "D2C Twin Update Failed rate limit (critical threshold)" + default = 10 +} + +variable "iothub_dropped_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Dropped limit (warning threshold)" + default = 500 +} + +variable "iothub_dropped_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Dropped limit (critical threshold)" + default = 1000 +} + +variable "iothub_orphaned_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Orphaned limit (warning threshold)" + default = 500 +} + +variable "iothub_orphaned_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Orphaned limit (critical threshold)" + default = 1000 +} + +variable "iothub_invalid_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Invalid 
limit (warning threshold)" + default = 500 +} + +variable "iothub_invalid_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Invalid limit (critical threshold)" + default = 1000 +} + +variable "iothub_fallback_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Fallback limit (warning threshold)" + default = 500 +} + +variable "iothub_fallback_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Fallback limit (critical threshold)" + default = 1000 +} + +# Azure Redis specific variables +variable "redis_evictedkeys_limit_threshold_warning" { + description = "Evicted keys limit (warning threshold)" + default = 0 +} + +variable "redis_evictedkeys_limit_threshold_critical" { + description = "Evicted keys limit (critical threshold)" + default = 100 +} + +variable "redis_percent_processor_time_threshold_critical" { + description = "Processor time percent (critical threshold)" + default = 80 +} + +variable "redis_percent_processor_time_threshold_warning" { + description = "Processor time percent (warning threshold)" + default = 60 +} + +variable "redis_server_load_rate_threshold_critical" { + description = "Server CPU load rate (critical threshold)" + default = 90 +} + +variable "redis_server_load_rate_threshold_warning" { + description = "Server CPU load rate (warning threshold)" + default = 70 +} + +# Azure SQL Database specific variables +variable "sqldatabase_cpu_threshold_warning" { + description = "CPU usage in percent (warning threshold)" + default = "" +} + +variable "sqldatabase_cpu_threshold_critical" { + description = "CPU usage in percent (critical threshold)" + default = "90" +} + +variable "sqldatabase_diskspace_threshold_warning" { + description = "Disk space used in percent (warning threshold)" + default = "80" +} + +variable "sqldatabase_diskspace_threshold_critical" { + description = "Disk space used in percent (critical threshold)" + default = "90" +} + +variable 
"sqldatabase_dtu_threshold_warning" { + description = "Amount of DTU used (warning threshold)" + default = "85" +} + +variable "sqldatabase_dtu_threshold_critical" { + description = "Amount of DTU used (critical threshold)" + default = "90" +} + +variable "sqldatabase_deadlock_threshold_critical" { + description = "Amount of Deadlocks (critical threshold)" + default = "1" +} + +# Azure Storage specific variables +variable "storage_availability_threshold_critical" { + description = "Minimum acceptable percent of availability for a storage" + default = 90 +} + +variable "storage_successful_requests_threshold_critical" { + description = "Minimum acceptable percent of successful requests for a storage" + default = 90 +} + +variable "storage_latency_threshold_critical" { + description = "Maximum acceptable end to end latency (ms) for a storage" + default = 1000 +} + +variable "storage_timeout_error_requests_threshold_critical" { + description = "Maximum acceptable percent of timeout error requests for a storage" + default = 5 +} + +variable "storage_network_error_requests_threshold_critical" { + description = "Maximum acceptable percent of network error requests for a storage" + default = 5 +} + +variable "storage_throttling_error_requests_threshold_critical" { + description = "Maximum acceptable percent of throttling error requests for a storage" + default = 10 +} + +variable "storage_server_other_error_requests_threshold_critical" { + description = "Maximum acceptable percent of server other error requests for a storage" + default = 10 +} + +variable "storage_client_other_error_requests_threshold_critical" { + description = "Maximum acceptable percent of client other error requests for a storage" + default = 15 +} + +variable "storage_authorization_error_requests_threshold_critical" { + description = "Maximum acceptable percent of authorization error requests for a storage" + default = 15 +} + +# Azure Stream Analytics specific variables +variable 
"streamanalytics_su_utilization_threshold_warning" { + description = "Streaming Unit utilization rate limit (warning threshold)" + default = 60 +} + +variable "streamanalytics_su_utilization_threshold_critical" { + description = "Streaming Unit utilization rate limit (critical threshold)" + default = 80 +} + +variable "streamanalytics_function_requests_threshold_warning" { + description = "Failed Function Request rate limit (warning threshold)" + default = 0 +} + +variable "streamanalytics_failed_function_requests_threshold_critical" { + description = "Failed Function Request rate limit (critical threshold)" + default = 10 +} + +variable "streamanalytics_conversion_errors_threshold_warning" { + description = "Conversion errors limit (warning threshold)" + default = 0 +} + +variable "streamanalytics_conversion_errors_threshold_critical" { + description = "Conversion errors limit (critical threshold)" + default = 10 +} + +variable "streamanalytics_runtime_errors_threshold_warning" { + description = "Runtime errors limit (warning threshold)" + default = 0 +} + +variable "streamanalytics_runtime_errors_threshold_critical" { + description = "Runtime errors limit (critical threshold)" + default = 10 +} diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index 5187715..e594a65 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -60,8 +60,7 @@ Inputs | failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | | fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | | fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | -| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | -| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| 
filter_tags | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | | invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | | message | Message sent when an alert is triggered | string | - | yes | diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 1b1348f..68c9965 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -14,14 +14,9 @@ variable "message" { description = "Message sent when an alert is triggered" } -variable "filter_tags_use_defaults" { - description = "Use default filter tags convention" - default = "true" -} - -variable "filter_tags_custom" { - description = "Tags used for custom filtering when filter_tags_use_defaults is false" - default = "*" +variable "filter_tags" { + description = "Tags used for filtering" + default = "*" } # Azure IOT hubs specific diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 9388f1c..5040c58 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,20 +1,12 @@ -data "template_file" "filter" { - template = "$${filter}" - - vars { - filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_iothub:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" - } -} - resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" message = "${var.message}" query = < ${var.failed_jobs_rate_threshold_critical} EOF @@ -45,9 +37,9 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { query = < ${var.failed_listjobs_rate_threshold_critical} EOF @@ -78,9 +70,9 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { query = < ${var.failed_queryjobs_rate_threshold_critical} EOF @@ -110,7 +102,7 @@ resource "datadog_monitor" "status" { message = "${var.message}" query = < ${var.failed_c2d_methods_rate_threshold_critical} EOF @@ -192,9 +184,9 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { query = < ${var.failed_c2d_twin_read_rate_threshold_critical} EOF @@ -225,9 +217,9 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { query = < ${var.failed_c2d_twin_update_rate_threshold_critical} EOF @@ -258,9 +250,9 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { query = < ${var.failed_d2c_twin_read_rate_threshold_critical} EOF @@ -291,9 +283,9 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { query = < ${var.failed_d2c_twin_update_rate_threshold_critical} EOF @@ -324,7 +316,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { query = < ${var.dropped_d2c_telemetry_egress_threshold_critical} EOF @@ -355,7 +347,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { query = < ${var.orphaned_d2c_telemetry_egress_threshold_critical} EOF @@ -386,7 +378,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { query = < ${var.invalid_d2c_telemetry_egress_threshold_critical} EOF @@ -417,7 +409,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { query = < ${var.fallback_d2c_telemetry_egress_threshold_critical} EOF @@ -448,8 +440,8 
@@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { query = < 0 EOF diff --git a/cloud/azure/monitors.tf b/cloud/azure/monitors.tf new file mode 100644 index 0000000..356d6c9 --- /dev/null +++ b/cloud/azure/monitors.tf @@ -0,0 +1,166 @@ +module "apimanagement" { + source = "./apimanagement" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + failed_requests_threshold_critical = "${var.apimanagement_failed_requests_threshold_critical}" + other_requests_threshold_critical = "${var.apimanagement_other_requests_threshold_critical}" + successful_requests_threshold_critical = "${var.apimanagement_successful_requests_threshold_critical}" + unauthorized_requests_threshold_critical = "${var.apimanagement_unauthorized_requests_threshold_critical}" +} + +module "appservices" { + source = "./app-services" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + http_2xx_status_rate_limit = "${var.appservices_http_2xx_status_rate_limit}" + http_2xx_status_rate_threshold_critical = "${var.appservices_http_2xx_status_rate_threshold_critical}" + http_2xx_status_rate_threshold_warning = "${var.appservices_http_2xx_status_rate_threshold_warning}" + http_404_errors_count_rate_limit = "${var.appservices_http_404_errors_count_rate_limit}" + http_404_errors_count_rate_threshold_critical = "${var.appservices_http_404_errors_count_rate_threshold_critical}" + http_404_errors_count_rate_threshold_warning = "${var.appservices_http_404_errors_count_rate_threshold_warning}" + memory_usage_threshold_critical = "${var.appservices_memory_usage_threshold_critical}" + memory_usage_threshold_warning = "${var.appservices_memory_usage_threshold_warning}" + 
response_time_threshold_critical = "${var.appservices_response_time_threshold_critical}" + response_time_threshold_warning = "${var.appservices_response_time_threshold_warning}" +} + +module "eventhub" { + source = "./eventhub" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + errors_rate_thresold_critical = "${var.eventhub_errors_rate_thresold_critical}" + errors_rate_thresold_warning = "${var.eventhub_errors_rate_thresold_warning}" + failed_requests_rate_thresold_critical = "${var.eventhub_failed_requests_rate_thresold_critical}" + failed_requests_rate_thresold_warning = "${var.eventhub_failed_requests_rate_thresold_warning}" +} + +module "iothub" { + source = "./iothubs" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags = "${var.non_taggable_filter_tags}" + + dropped_d2c_telemetry_egress_threshold_critical = "${var.iothub_dropped_d2c_telemetry_egress_threshold_critical}" + dropped_d2c_telemetry_egress_threshold_warning = "${var.iothub_dropped_d2c_telemetry_egress_threshold_warning}" + failed_c2d_methods_rate_threshold_critical = "${var.iothub_failed_c2d_methods_rate_threshold_critical}" + failed_c2d_methods_rate_threshold_warning = "${var.iothub_failed_c2d_methods_rate_threshold_warning}" + failed_c2d_twin_read_rate_threshold_critical = "${var.iothub_failed_c2d_twin_read_rate_threshold_critical}" + failed_c2d_twin_read_rate_threshold_warning = "${var.iothub_failed_c2d_twin_read_rate_threshold_warning}" + failed_c2d_twin_update_rate_threshold_critical = "${var.iothub_failed_c2d_twin_update_rate_threshold_critical}" + failed_c2d_twin_update_rate_threshold_warning = "${var.iothub_failed_c2d_twin_update_rate_threshold_warning}" + failed_d2c_twin_read_rate_threshold_critical = "${var.iothub_failed_d2c_twin_read_rate_threshold_critical}" + 
failed_d2c_twin_read_rate_threshold_warning = "${var.iothub_failed_d2c_twin_read_rate_threshold_warning}" + failed_d2c_twin_update_rate_threshold_critical = "${var.iothub_failed_d2c_twin_update_rate_threshold_critical}" + failed_d2c_twin_update_rate_threshold_warning = "${var.iothub_failed_d2c_twin_update_rate_threshold_warning}" + failed_jobs_rate_threshold_critical = "${var.iothub_failed_jobs_rate_threshold_critical}" + failed_jobs_rate_threshold_warning = "${var.iothub_failed_jobs_rate_threshold_warning}" + failed_listjobs_rate_threshold_critical = "${var.iothub_failed_listjobs_rate_threshold_critical}" + failed_listjobs_rate_threshold_warning = "${var.iothub_failed_listjobs_rate_threshold_warning}" + failed_queryjobs_rate_threshold_critical = "${var.iothub_failed_queryjobs_rate_threshold_critical}" + failed_queryjobs_rate_threshold_warning = "${var.iothub_failed_queryjobs_rate_threshold_warning}" + fallback_d2c_telemetry_egress_threshold_critical = "${var.iothub_fallback_d2c_telemetry_egress_threshold_critical}" + fallback_d2c_telemetry_egress_threshold_warning = "${var.iothub_fallback_d2c_telemetry_egress_threshold_warning}" + invalid_d2c_telemetry_egress_threshold_critical = "${var.iothub_invalid_d2c_telemetry_egress_threshold_critical}" + invalid_d2c_telemetry_egress_threshold_warning = "${var.iothub_invalid_d2c_telemetry_egress_threshold_warning}" + orphaned_d2c_telemetry_egress_threshold_critical = "${var.iothub_orphaned_d2c_telemetry_egress_threshold_critical}" + orphaned_d2c_telemetry_egress_threshold_warning = "${var.iothub_orphaned_d2c_telemetry_egress_threshold_warning}" +} + +module "redis" { + source = "./redis" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + evictedkeys_limit_threshold_critical = "${var.redis_evictedkeys_limit_threshold_critical}" + evictedkeys_limit_threshold_warning = 
"${var.redis_evictedkeys_limit_threshold_warning}" + percent_processor_time_threshold_critical = "${var.redis_percent_processor_time_threshold_critical}" + percent_processor_time_threshold_warning = "${var.redis_percent_processor_time_threshold_warning}" + server_load_rate_threshold_critical = "${var.redis_server_load_rate_threshold_critical}" + server_load_rate_threshold_warning = "${var.redis_server_load_rate_threshold_warning}" +} + +module "sqldatabase" { + source = "./sql-database" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + cpu_threshold_critical = "${var.sqldatabase_cpu_threshold_critical}" + cpu_threshold_warning = "${var.sqldatabase_cpu_threshold_warning}" + deadlock_threshold_critical = "${var.sqldatabase_deadlock_threshold_critical}" + diskspace_threshold_critical = "${var.sqldatabase_diskspace_threshold_critical}" + diskspace_threshold_warning = "${var.sqldatabase_diskspace_threshold_warning}" + dtu_threshold_critical = "${var.sqldatabase_dtu_threshold_critical}" + dtu_threshold_warning = "${var.sqldatabase_dtu_threshold_warning}" +} + +module "storage" { + source = "./storage" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + authorization_error_requests_threshold_critical = "${var.storage_authorization_error_requests_threshold_critical}" + availability_threshold_critical = "${var.storage_availability_threshold_critical}" + client_other_error_requests_threshold_critical = "${var.storage_client_other_error_requests_threshold_critical}" + latency_threshold_critical = "${var.storage_latency_threshold_critical}" + network_error_requests_threshold_critical = "${var.storage_network_error_requests_threshold_critical}" + 
server_other_error_requests_threshold_critical = "${var.storage_server_other_error_requests_threshold_critical}" + successful_requests_threshold_critical = "${var.storage_successful_requests_threshold_critical}" + throttling_error_requests_threshold_critical = "${var.storage_throttling_error_requests_threshold_critical}" + timeout_error_requests_threshold_critical = "${var.storage_timeout_error_requests_threshold_critical}" +} + +module "streamanalytics" { + source = "./stream-analytics" + + environment = "${var.environment}" + message = "${var.message}" + delay = "${var.delay}" + + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + + conversion_errors_threshold_critical = "${var.streamanalytics_conversion_errors_threshold_critical}" + conversion_errors_threshold_warning = "${var.streamanalytics_conversion_errors_threshold_warning}" + failed_function_requests_threshold_critical = "${var.streamanalytics_failed_function_requests_threshold_critical}" + function_requests_threshold_warning = "${var.streamanalytics_function_requests_threshold_warning}" + runtime_errors_threshold_critical = "${var.streamanalytics_runtime_errors_threshold_critical}" + runtime_errors_threshold_warning = "${var.streamanalytics_runtime_errors_threshold_warning}" + su_utilization_threshold_critical = "${var.streamanalytics_su_utilization_threshold_critical}" + su_utilization_threshold_warning = "${var.streamanalytics_su_utilization_threshold_warning}" +} From 2680f12280644c369c322246c96dbf8933325247 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Fri, 15 Dec 2017 17:04:33 +0100 Subject: [PATCH 92/93] MON-73 Update app-services monitors input mapping --- cloud/azure/README.md | 12 ++++++------ cloud/azure/inputs.tf | 38 ++++++++++++++++++++------------------ cloud/azure/monitors.tf | 20 ++++++++++---------- 3 files changed, 36 insertions(+), 34 deletions(-) diff --git a/cloud/azure/README.md b/cloud/azure/README.md index 
5dcff20..124fd89 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -35,12 +35,12 @@ Inputs | apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no | | apimanagement_successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no | | apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no | -| appservices_http_2xx_status_rate_limit | | string | `30` | no | -| appservices_http_2xx_status_rate_threshold_critical | Alerting threshold (percentage) | string | `0.9` | no | -| appservices_http_2xx_status_rate_threshold_warning | Warning threshold (percentage) | string | `0.95` | no | -| appservices_http_404_errors_count_rate_limit | | string | `30` | no | -| appservices_http_404_errors_count_rate_threshold_critical | Alerting threshold (number of requests) | string | `30` | no | -| appservices_http_404_errors_count_rate_threshold_warning | Warning threshold (number of requests) | string | `10` | no | +| appservices_http_2xx_requests_threshold_critical | Minimum critical acceptable percent of 2xx requests | string | `90` | no | +| appservices_http_2xx_requests_threshold_warning | Minimum warning acceptable percent of 2xx requests | string | `95` | no | +| appservices_http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no | +| appservices_http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no | +| appservices_http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no | +| appservices_http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no | | appservices_memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no | | 
appservices_memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no | | appservices_response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no | diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf index 5526988..cf4510a 100644 --- a/cloud/azure/inputs.tf +++ b/cloud/azure/inputs.tf @@ -70,32 +70,34 @@ variable "appservices_memory_usage_threshold_warning" { description = "Warning threshold in MiB" } -variable "appservices_http_404_errors_count_rate_limit" { - default = 30 -} - -variable "appservices_http_404_errors_count_rate_threshold_critical" { +variable "appservices_http_4xx_requests_threshold_critical" { default = 30 - description = "Alerting threshold (number of requests)" + description = "Maximum critical acceptable percent of 4xx errors" } -variable "appservices_http_404_errors_count_rate_threshold_warning" { +variable "appservices_http_4xx_requests_threshold_warning" { + default = 15 + description = "Maximum warning acceptable percent of 4xx errors" +} + +variable "appservices_http_5xx_requests_threshold_critical" { + default = 20 + description = "Maximum critical acceptable percent of 5xx errors" +} + +variable "appservices_http_5xx_requests_threshold_warning" { default = 10 - description = "Warning threshold (number of requests)" + description = "Maximum warning acceptable percent of 5xx errors" } -variable "appservices_http_2xx_status_rate_limit" { - default = 30 +variable "appservices_http_2xx_requests_threshold_critical" { + default = 90 + description = "Minimum critical acceptable percent of 2xx requests" } -variable "appservices_http_2xx_status_rate_threshold_critical" { - default = 0.9 - description = "Alerting threshold (percentage)" -} - -variable "appservices_http_2xx_status_rate_threshold_warning" { - default = 0.95 - description = "Warning threshold (percentage)" +variable "appservices_http_2xx_requests_threshold_warning" { + default = 95 + description = "Minimum warning acceptable 
percent of 2xx requests" } # Azure Event Hub specific variables diff --git a/cloud/azure/monitors.tf b/cloud/azure/monitors.tf index 356d6c9..fc9aeee 100644 --- a/cloud/azure/monitors.tf +++ b/cloud/azure/monitors.tf @@ -24,16 +24,16 @@ module "appservices" { filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_custom = "${var.filter_tags_custom}" - http_2xx_status_rate_limit = "${var.appservices_http_2xx_status_rate_limit}" - http_2xx_status_rate_threshold_critical = "${var.appservices_http_2xx_status_rate_threshold_critical}" - http_2xx_status_rate_threshold_warning = "${var.appservices_http_2xx_status_rate_threshold_warning}" - http_404_errors_count_rate_limit = "${var.appservices_http_404_errors_count_rate_limit}" - http_404_errors_count_rate_threshold_critical = "${var.appservices_http_404_errors_count_rate_threshold_critical}" - http_404_errors_count_rate_threshold_warning = "${var.appservices_http_404_errors_count_rate_threshold_warning}" - memory_usage_threshold_critical = "${var.appservices_memory_usage_threshold_critical}" - memory_usage_threshold_warning = "${var.appservices_memory_usage_threshold_warning}" - response_time_threshold_critical = "${var.appservices_response_time_threshold_critical}" - response_time_threshold_warning = "${var.appservices_response_time_threshold_warning}" + http_2xx_requests_threshold_critical = "${var.appservices_http_2xx_requests_threshold_critical}" + http_2xx_requests_threshold_warning = "${var.appservices_http_2xx_requests_threshold_warning}" + http_5xx_requests_threshold_critical = "${var.appservices_http_5xx_requests_threshold_critical}" + http_5xx_requests_threshold_warning = "${var.appservices_http_5xx_requests_threshold_warning}" + http_4xx_requests_threshold_critical = "${var.appservices_http_4xx_requests_threshold_critical}" + http_4xx_requests_threshold_warning = "${var.appservices_http_4xx_requests_threshold_warning}" + memory_usage_threshold_critical = 
"${var.appservices_memory_usage_threshold_critical}" + memory_usage_threshold_warning = "${var.appservices_memory_usage_threshold_warning}" + response_time_threshold_critical = "${var.appservices_response_time_threshold_critical}" + response_time_threshold_warning = "${var.appservices_response_time_threshold_warning}" } module "eventhub" { From a3f7795ceb0120bb1e75e911773c686ec24968bc Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Fri, 15 Dec 2017 17:27:39 +0100 Subject: [PATCH 93/93] MON-73 Fix some default values and README files --- cloud/azure/README.md | 2 +- cloud/azure/inputs.tf | 2 +- cloud/azure/sql-database/README.md | 2 +- cloud/azure/sql-database/inputs.tf | 2 +- cloud/azure/storage/README.md | 22 +++++++++++----------- cloud/azure/stream-analytics/README.md | 18 ++++++++---------- 6 files changed, 23 insertions(+), 25 deletions(-) diff --git a/cloud/azure/README.md b/cloud/azure/README.md index 124fd89..5d0cac8 100644 --- a/cloud/azure/README.md +++ b/cloud/azure/README.md @@ -86,7 +86,7 @@ Inputs | redis_server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no | | redis_server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no | | sqldatabase_cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | -| sqldatabase_cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `` | no | +| sqldatabase_cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no | | sqldatabase_deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no | | sqldatabase_diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no | | sqldatabase_diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no | diff --git a/cloud/azure/inputs.tf b/cloud/azure/inputs.tf index cf4510a..775fc3e 100644 --- 
a/cloud/azure/inputs.tf +++ b/cloud/azure/inputs.tf @@ -276,7 +276,7 @@ variable "redis_server_load_rate_threshold_warning" { # Azure SQL Database specific variables variable "sqldatabase_cpu_threshold_warning" { description = "CPU usage in percent (warning threshold)" - default = "" + default = "80" } variable "sqldatabase_cpu_threshold_critical" { diff --git a/cloud/azure/sql-database/README.md b/cloud/azure/sql-database/README.md index 7d815e3..8f42bde 100644 --- a/cloud/azure/sql-database/README.md +++ b/cloud/azure/sql-database/README.md @@ -28,7 +28,7 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| | cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | -| cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `` | no | +| cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no | | deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no | diff --git a/cloud/azure/sql-database/inputs.tf b/cloud/azure/sql-database/inputs.tf index 9ddab06..aa81cfb 100644 --- a/cloud/azure/sql-database/inputs.tf +++ b/cloud/azure/sql-database/inputs.tf @@ -28,7 +28,7 @@ variable "filter_tags_custom" { variable "cpu_threshold_warning" { description = "CPU usage in percent (warning threshold)" - default = "" + default = "80" } variable "cpu_threshold_critical" { diff --git a/cloud/azure/storage/README.md b/cloud/azure/storage/README.md index 0849152..7702683 100644 --- a/cloud/azure/storage/README.md +++ b/cloud/azure/storage/README.md @@ -32,20 +32,20 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| +| authorization_error_requests_threshold_critical | Maximum 
acceptable percent of authorization error requests for a storage | string | `15` | no | +| availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `90` | no | +| client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `15` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| message | Message sent when a monitor is triggered | string | - | yes | -| filter_tags_use_defaults | Use default tagging convention | string | `true` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | -| availability_threshold_critical | Minimum threshold of availability | string | `90` | no | -| successful_requests_threshold_critical | Minimum threshold of successful requests | string | `90` | no | -| latency_threshold_critical | Maximum threshold of latency in ms | string | `1000` | no | -| timeout_error_requests_threshold_critical | Maximum threshold of timeout error requests in percent | string | `35` | no | -| network_error_requests_threshold_critical | Maximum threshold of network error requests in percent | string | `35` | no | -| throttling_error_requests_threshold_critical | Maximum threshold of throttling error requests in percent | string | `50` | no | -| server_other_error_requests_threshold_critical | Maximum threshold of server other error requests in percent | string | `50` | no | -| client_other_error_requests_threshold_critical | Maximum threshold of client other error requests in percent | string | `75` | no | -| authorization_error_requests_threshold_critical | Maximum threshold of authorization error requests in percent | string | `75` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| latency_threshold_critical | Maximum acceptable end to 
end latency (ms) for a storage | string | `1000` | no | +| message | Message sent when a Redis monitor is triggered | string | - | yes | +| network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `5` | no | +| server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `10` | no | +| successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `90` | no | +| throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `10` | no | +| timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `5` | no | Related documentation --------------------- diff --git a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md index dca299b..53422c8 100644 --- a/cloud/azure/stream-analytics/README.md +++ b/cloud/azure/stream-analytics/README.md @@ -23,17 +23,15 @@ Inputs | conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | +| failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | -| message | Message sent when a monitor is triggered | string | - | 
yes | -| provider | What is the monitored provider | string | azure | no | -| runtime_errors_threshold_critical | | string | `10` | no | -| runtime_errors_threshold_warning | | string | `0` | no | -| su_utilization_threshold_critical | | string | `80` | no | -| su_utilization_threshold_warning | Monitor specific | string | `60` | no | -| service | What is the monitored service | string | storage | no | -| subscription_id | Azure account id used as filter for monitors | string | - | yes | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | +| message | Message sent when a Redis monitor is triggered | string | - | yes | +| runtime_errors_threshold_critical | Runtime errors limit (critical threshold) | string | `10` | no | +| runtime_errors_threshold_warning | Runtime errors limit (warning threshold) | string | `0` | no | +| su_utilization_threshold_critical | Streaming Unit utilization rate limit (critical threshold) | string | `80` | no | +| su_utilization_threshold_warning | Streaming Unit utilization rate limit (warning threshold) | string | `60` | no | Related documentation ---------------------