From e4e929ec1d6bb380da5eed54bd324ab33fc513ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 12:47:19 +0100 Subject: [PATCH 01/19] MON-78 Add datadog monitor for stream analytics --- cloud/azure/stream-analytics/inputs.tf | 44 +++++++++ .../monitors-stream-analytics.tf | 92 +++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 cloud/azure/stream-analytics/inputs.tf create mode 100644 cloud/azure/stream-analytics/monitors-stream-analytics.tf diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf new file mode 100644 index 0000000..e9bc507 --- /dev/null +++ b/cloud/azure/stream-analytics/inputs.tf @@ -0,0 +1,44 @@ +variable "hno_escalation_group" {} +variable "ho_escalation_group" {} + +variable "environment" {} + +variable "notify_no_data" { + default = "false" +} + +variable "delay" { + default = "600" +} + +variable "su_utilization_warning" { + default = 60 +} + +variable "su_utilization_critical" { + default = 80 +} + +variable "failed_function_requests_warning" { + default = 0 +} + +variable "failed_function_requests_critical" { + default = 10 +} + +variable "conversion_errors_warning" { + default = 0 +} + +variable "conversion_errors_critical" { + default = 10 +} + +variable "runtime_errors_warning" { + default = 0 +} + +variable "runtime_errors_critical" { + default = 0 +} diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf new file mode 100644 index 0000000..f18d7f1 --- /dev/null +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -0,0 +1,92 @@ +resource "datadog_monitor" "SU_utilization" { + name = "[${var.environment} SU utilization at more than ${var.su_utilization_critical}% on {{host.identifier}}]" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = 
"avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{*} by {name,resource_group} > ${var.su_utilization_critical}" + type = "query alert" + + notify_no_data = "${var.notify_no_data}" + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.su_utilization_warning}" + critical = "${var.su_utilization_critical}" + } +} + +resource "datadog_monitor" "failed_function_requests" { + name = "[${var.environment} More than ${var.failed_function_requests_critical} failed function requests on {{host.identifier}}]" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}" + type = "query alert" + + notify_no_data = "${var.notify_no_data}" + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.failed_function_requests_warning}" + critical = "${var.failed_function_requests_critical}" + } +} + +resource "datadog_monitor" "conversion_errors" { + name = "[${var.environment} More than ${var.conversion_errors_critical} conversion errors on {{host.identifier}}]" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_critical}" + type = "query alert" + + notify_no_data = "${var.notify_no_data}" + 
evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.conversion_errors_warning}" + critical = "${var.conversion_errors_critical}" + } +} + +resource "datadog_monitor" "runtime_errors" { + name = "[${var.environment} More than ${var.runtime_errors_critical} runtime errors on {{host.identifier}}]" + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}" + type = "query alert" + + notify_no_data = "${var.notify_no_data}" + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + thresholds { + warning = "${var.runtime_errors_warning}" + critical = "${var.runtime_errors_critical}" + } +} + From 17fa260daf594ab65043310053f0f534d49bff7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 13:08:17 +0100 Subject: [PATCH 02/19] MON-78 Corrected bad warning value for runtime_errors --- cloud/azure/stream-analytics/inputs.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index e9bc507..4ea5ee6 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -40,5 +40,5 @@ variable "runtime_errors_warning" { } variable "runtime_errors_critical" { - default = 0 + default = 10 } From 34ef735a076884ef27474431a0df695b9228858e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 14:18:15 
+0100 Subject: [PATCH 03/19] MON-78: Changed host.identifier for name to identify the streamanalytics object with issues --- .../azure/stream-analytics/monitors-stream-analytics.tf | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index f18d7f1..ea2920f 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,5 +1,5 @@ resource "datadog_monitor" "SU_utilization" { - name = "[${var.environment} SU utilization at more than ${var.su_utilization_critical}% on {{host.identifier}}]" + name = "[${var.environment}] SU utilization at more than ${var.su_utilization_critical}% on {{name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{*} by {name,resource_group} > ${var.su_utilization_critical}" @@ -22,7 +22,7 @@ resource "datadog_monitor" "SU_utilization" { } resource "datadog_monitor" "failed_function_requests" { - name = "[${var.environment} More than ${var.failed_function_requests_critical} failed function requests on {{host.identifier}}]" + name = "[${var.environment}] More than ${var.failed_function_requests_critical} failed function requests on {{name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}" @@ -45,7 +45,7 @@ resource "datadog_monitor" "failed_function_requests" { } resource "datadog_monitor" "conversion_errors" { - name = "[${var.environment} More than ${var.conversion_errors_critical} conversion errors on
{{host.identifier}}]" + name = "[${var.environment}] More than ${var.conversion_errors_critical} conversion errors on {{name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_critical}" @@ -68,7 +68,7 @@ resource "datadog_monitor" "conversion_errors" { } resource "datadog_monitor" "runtime_errors" { - name = "[${var.environment} More than ${var.runtime_errors_critical} runtime errors on {{host.identifier}}]" + name = "[${var.environment}] More than ${var.runtime_errors_critical} runtime errors on {{name}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}" @@ -89,4 +89,3 @@ resource "datadog_monitor" "runtime_errors" { critical = "${var.runtime_errors_critical}" } } - From 51b3b5010da96533a605c94f2d9e6d44ea05f495 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 15:22:52 +0100 Subject: [PATCH 04/19] MON-78 Changed variable names --- cloud/azure/stream-analytics/inputs.tf | 8 ++++++-- .../stream-analytics/monitors-stream-analytics.tf | 10 +++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 4ea5ee6..529e669 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -1,5 +1,5 @@ -variable "hno_escalation_group" {} -variable "ho_escalation_group" {} +variable "critical_escalation_group" {} +variable "warning_escalation_group" {} variable "environment" {} @@ -7,6 +7,10 @@ variable "notify_no_data" { default = "false" } +variable "filter_tags" { + 
default = "*" +} + variable "delay" { default = "600" } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index ea2920f..4e64044 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,8 +1,8 @@ resource "datadog_monitor" "SU_utilization" { name = "[${var.environment}] SU utilization at more than ${var.su_utilization_critical}% on {{name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{*} by {name,resource_group} > ${var.su_utilization_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > ${var.su_utilization_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -23,7 +23,7 @@ resource "datadog_monitor" "SU_utilization" { resource "datadog_monitor" "failed_function_requests" { name = "[${var.environment}] More than ${var.failed_function_requests_critical} failed function requests on {{name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}" type = "query alert" @@ -46,7 +46,7 @@ resource "datadog_monitor" "failed_function_requests" { resource "datadog_monitor" "conversion_errors" 
{ name = "[${var.environment}] More than ${var.conversion_errors_critical} conversion errors on {{name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_critical}" type = "query alert" @@ -69,7 +69,7 @@ resource "datadog_monitor" "conversion_errors" { resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_critical} runtime errors on {{name}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}" type = "query alert" From 54a90b3972a2a2a374f5a5726350f38ad2fdf52d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 15:34:57 +0100 Subject: [PATCH 05/19] MON-78 Removed upper case resource name --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 4e64044..68043f8 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,4 +1,4 @@ -resource "datadog_monitor" "SU_utilization" { +resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] SU utilization at more than 
${var.su_utilization_critical}% on {{name}}" message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" From 9261bde1588268650f9f1295489daf756257ff8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 15:51:00 +0100 Subject: [PATCH 06/19] MON-78: Remove escalation variables, add message variable --- cloud/azure/stream-analytics/inputs.tf | 3 +-- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 529e669..d240169 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -1,5 +1,4 @@ -variable "critical_escalation_group" {} -variable "warning_escalation_group" {} +variable "message" {} variable "environment" {} diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 68043f8..6cf42c5 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,6 +1,6 @@ resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] SU utilization at more than ${var.su_utilization_critical}% on {{name}}" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > ${var.su_utilization_critical}" type = "query alert" @@ -23,7 +23,7 @@ resource "datadog_monitor" "su_utilization" { resource "datadog_monitor" "failed_function_requests" { name = "[${var.environment}] More than ${var.failed_function_requests_critical} failed function requests on 
{{name}}" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}" type = "query alert" @@ -46,7 +46,7 @@ resource "datadog_monitor" "failed_function_requests" { resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] More than ${var.conversion_errors_critical} conversion errors on {{name}}" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_critical}" type = "query alert" @@ -69,7 +69,7 @@ resource "datadog_monitor" "conversion_errors" { resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_critical} runtime errors on {{name}}" - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.message}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}" type = "query alert" From 0b03cade41951578a3f6363b0733d31eee4e93e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 16:35:44 +0100 Subject: [PATCH 07/19] MON-78 Changing naming convention for variables --- cloud/azure/stream-analytics/inputs.tf | 16 +++++----- .../monitors-stream-analytics.tf | 32 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index d240169..2d0619a 100644 
--- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -14,34 +14,34 @@ variable "delay" { default = "600" } -variable "su_utilization_warning" { +variable "su_utilization_threshold_warning" { default = 60 } -variable "su_utilization_critical" { +variable "su_utilization_threshold_critical" { default = 80 } -variable "failed_function_requests_warning" { +variable "function_requests_threshold_warning" { default = 0 } -variable "failed_function_requests_critical" { +variable "function_requests_threshold_critical" { default = 10 } -variable "conversion_errors_warning" { +variable "conversion_errors_threshold_warning" { default = 0 } -variable "conversion_errors_critical" { +variable "conversion_errors_threshold_critical" { default = 10 } -variable "runtime_errors_warning" { +variable "runtime_errors_threshold_warning" { default = 0 } -variable "runtime_errors_critical" { +variable "runtime_errors_threshold_critical" { default = 10 } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 6cf42c5..55ac674 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,8 +1,8 @@ resource "datadog_monitor" "su_utilization" { - name = "[${var.environment}] SU utilization at more than ${var.su_utilization_critical}% on {{name}}" + name = "[${var.environment}] SU utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > ${var.su_utilization_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -16,16 +16,16 @@ 
resource "datadog_monitor" "su_utilization" { new_host_delay = "${var.delay}" no_data_timeframe = 20 thresholds { - warning = "${var.su_utilization_warning}" - critical = "${var.su_utilization_critical}" + warning = "${var.su_utilization_threshold_warning}" + critical = "${var.su_utilization_threshold_critical}" } } resource "datadog_monitor" "failed_function_requests" { - name = "[${var.environment}] More than ${var.failed_function_requests_critical} failed function requests on {{name}}" + name = "[${var.environment}] More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.function_requests_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -39,16 +39,16 @@ resource "datadog_monitor" "failed_function_requests" { new_host_delay = "${var.delay}" no_data_timeframe = 20 thresholds { - warning = "${var.failed_function_requests_warning}" - critical = "${var.failed_function_requests_critical}" + warning = "${var.function_requests_threshold_warning}" + critical = "${var.function_requests_threshold_critical}" } } resource "datadog_monitor" "conversion_errors" { - name = "[${var.environment}] More than ${var.conversion_errors_critical} conversion errors on {{name}}" + name = "[${var.environment}] More than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > 
${var.conversion_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -62,16 +62,16 @@ resource "datadog_monitor" "conversion_errors" { new_host_delay = "${var.delay}" no_data_timeframe = 20 thresholds { - warning = "${var.conversion_errors_warning}" - critical = "${var.conversion_errors_critical}" + warning = "${var.conversion_errors_threshold_warning}" + critical = "${var.conversion_errors_threshold_critical}" } } resource "datadog_monitor" "runtime_errors" { - name = "[${var.environment}] More than ${var.runtime_errors_critical} runtime errors on {{name}}" + name = "[${var.environment}] More than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -85,7 +85,7 @@ resource "datadog_monitor" "runtime_errors" { new_host_delay = "${var.delay}" no_data_timeframe = 20 thresholds { - warning = "${var.runtime_errors_warning}" - critical = "${var.runtime_errors_critical}" + warning = "${var.runtime_errors_threshold_warning}" + critical = "${var.runtime_errors_threshold_critical}" } } From 0706a50badd6a4b442fe3afa6ab82712197572b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 17:01:21 +0100 Subject: [PATCH 08/19] MON-78: Changed monitor name for better clarity --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 55ac674..ed4c51f 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ 
b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,5 +1,5 @@ resource "datadog_monitor" "su_utilization" { - name = "[${var.environment}] SU utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" + name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" From 1f059622ed932ee209847dff647d30abc19ebdd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 17:46:21 +0100 Subject: [PATCH 09/19] MON-78 Changed filter to reach proper resources --- cloud/azure/stream-analytics/inputs.tf | 4 ++-- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 2d0619a..1c3ff2e 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -6,8 +6,8 @@ variable "notify_no_data" { default = "false" } -variable "filter_tags" { - default = "*" +variable "use_filter_tags" { + default = "true" } variable "delay" { diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index ed4c51f..6903b6a 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -2,7 +2,7 @@ resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.filter_tags}} by {name,resource_group} > 
${var.su_utilization_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"} by {name,resource_group} > ${var.su_utilization_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -25,7 +25,7 @@ resource "datadog_monitor" "failed_function_requests" { name = "[${var.environment}] More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.function_requests_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.function_requests_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -48,7 +48,7 @@ resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] More than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_stream_analytics:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -71,7 +71,7 @@ resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"} by {name,resource_group}} > ${var.runtime_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" From aaabb129b5ae66cc9b2e2f940bac0fc7e9f8ee91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 17:52:40 +0100 Subject: [PATCH 10/19] MON-78 Forgot a } --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 6903b6a..e95825e 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -2,7 +2,7 @@ resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"} by {name,resource_group} > ${var.su_utilization_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" From 686765bcaa83e795f9608aad0f39c681e589477c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?= Date: Mon, 30 Oct 2017 18:00:06 +0100 Subject: [PATCH 11/19] MON-78 Corrected typo in query for runtime_errors --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index e95825e..6ca7717 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -71,7 +71,7 @@ resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"} by {name,resource_group}} > ${var.runtime_errors_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" From f916fbfc81ffdfe273eafc6bcab98432faf1b0f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 12:00:56 +0100 Subject: [PATCH 12/19] MON-78: Readme --- cloud/azure/stream-analytics/README.md | 39 ++++++++++++++++++++++++++ cloud/azure/stream-analytics/inputs.tf | 24 +++++++++++----- 2 files changed, 56 insertions(+), 7 deletions(-) create mode 100644 cloud/azure/stream-analytics/README.md diff --git a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md new file mode 100644 index 0000000..83d0af4 --- /dev/null +++ b/cloud/azure/stream-analytics/README.md @@ -0,0 +1,39 @@ +Azure Stream Analytics DataDog monitors +======================================= + +How to use this module +---------------------- + +``` +module "datadog-monitors-azure-redis" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/stream-analytics?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" +} +``` + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| conversion_errors_threshold_critical | | string | `10` | no | +| conversion_errors_threshold_warning | | string | `0` | no | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture environment | string | - | yes | +| function_requests_threshold_critical | | string | `10` | no | +| function_requests_threshold_warning | | string | `0` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| notify_no_data | | string | `false` | no | +| runtime_errors_threshold_critical | | string | `10` | no | +| 
runtime_errors_threshold_warning | | string | `0` | no | +| su_utilization_threshold_critical | | string | `80` | no | +| su_utilization_threshold_warning | Monitor specific | string | `60` | no | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 1c3ff2e..29db469 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -1,19 +1,29 @@ -variable "message" {} +# Global Terraform +variable "environment" { + description = "Architecture environment" + type = "string" +} -variable "environment" {} +variable "message" { + description = "Message sent when a monitor is triggered" +} + +# Global DataDog +variable "use_filter_tags" { + description = "Filter the data with service tags if true" + default = "true" +} variable "notify_no_data" { default = "false" } -variable "use_filter_tags" { - default = "true" -} - variable "delay" { - default = "600" + description = "Delay in seconds for the metric evaluation" + default = 600 } +# Monitor specific variable "su_utilization_threshold_warning" { default = 60 } From 1a278fc81c90e853c0493132cd4f3e3f89858334 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Respaut?= Date: Tue, 31 Oct 2017 12:04:35 +0100 Subject: [PATCH 13/19] MON-78: Fixup use filter tag usage --- .../monitors-stream-analytics.tf | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 6ca7717..0972bd4 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -1,8 +1,16 @@ +data 
"template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_stream_analytics:enabled,env:%s", var.environment) : "*"}" + } +} + resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -25,7 +33,7 @@ resource "datadog_monitor" "failed_function_requests" { name = "[${var.environment}] More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.function_requests_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.function_requests_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -48,7 +56,7 @@ resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] More than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_stream_analytics:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -71,7 +79,7 @@ resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_eventhub:enabled,env:%s", var.environment) : "*"}} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" + query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" type = "query alert" notify_no_data = "${var.notify_no_data}" From 41997c9afe58583177acb7915036c5cd8cbdd910 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 14:41:14 +0100 Subject: [PATCH 14/19] MON-78 Add EOF on queries --- .../monitors-stream-analytics.tf | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 0972bd4..8824410 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -10,7 +10,11 @@ resource "datadog_monitor" "su_utilization" { name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.su_utilization_threshold_critical}" + query = < ${var.su_utilization_threshold_critical} + EOF type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -33,7 +37,11 @@ resource "datadog_monitor" "failed_function_requests" { name = "[${var.environment}] More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.function_requests_threshold_critical}" + query = < ${var.function_requests_threshold_critical} + EOF 
type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -56,7 +64,11 @@ resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] More than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.conversion_errors_threshold_critical}" + query = < ${var.conversion_errors_threshold_critical} + EOF type = "query alert" notify_no_data = "${var.notify_no_data}" @@ -79,7 +91,11 @@ resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] More than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" message = "${var.message}" - query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {name,resource_group} > ${var.runtime_errors_threshold_critical}" + query = < ${var.runtime_errors_threshold_critical} + EOF type = "query alert" notify_no_data = "${var.notify_no_data}" From 0b896d784b0db61fd975fe4876ca896e25c4c3ad Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 15:41:41 +0100 Subject: [PATCH 15/19] MON-78 Add Stream Analytics on several names to be more specific --- cloud/azure/stream-analytics/monitors-stream-analytics.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 8824410..6e6f651 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -34,7 +34,7 @@ resource "datadog_monitor" "su_utilization" { } resource "datadog_monitor" "failed_function_requests" { - name = "[${var.environment}] More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" + name = "[${var.environment}] 
Stream Analytics : More than ${var.function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" query = < Date: Fri, 3 Nov 2017 20:21:44 +0100 Subject: [PATCH 16/19] MON-78 add subscription_id and tags --- cloud/azure/stream-analytics/README.md | 3 +- cloud/azure/stream-analytics/inputs.tf | 33 +++++++++++++++---- .../monitors-stream-analytics.tf | 18 +++++++--- 3 files changed, 41 insertions(+), 13 deletions(-) diff --git a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md index 83d0af4..f115e70 100644 --- a/cloud/azure/stream-analytics/README.md +++ b/cloud/azure/stream-analytics/README.md @@ -9,8 +9,8 @@ module "datadog-monitors-azure-redis" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/stream-analytics?ref={revision}" message = "${module.datadog-message-alerting.alerting-message}" - environment = "${var.environment}" + subscription_id = "${var.subscription_id}" } ``` @@ -31,6 +31,7 @@ Inputs | runtime_errors_threshold_warning | | string | `0` | no | | su_utilization_threshold_critical | | string | `80` | no | | su_utilization_threshold_warning | Monitor specific | string | `60` | no | +| subscription_id | Azure account id used as filter for monitors | string | - | yes | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 29db469..8160547 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -8,14 +8,28 @@ variable "message" { description = "Message sent when a monitor is triggered" } -# Global DataDog -variable "use_filter_tags" { - description = "Filter the data with service tags if true" - default = "true" +variable "subscription_id" { + description = "Azure account id used as filter for monitors" + type = "string" } -variable "notify_no_data" { - 
default = "false" +variable "provider" { + description = "Cloud provider which the monitor and its based metric depend on" + type = "string" + default = "azure" +} + +variable "service" { + description = "Service monitored by this set of monitors" + type = "string" + default = "storage" +} + +# Global DataDog + + +variable "message" { + description = "Message sent when a Redis monitor is triggered" } variable "delay" { @@ -23,7 +37,12 @@ variable "delay" { default = 600 } -# Monitor specific +variable "use_filter_tags" { + description = "Filter the data with service tags if true" + default = "true" +} + +# Azure Stream Analytics specific variable "su_utilization_threshold_warning" { default = 60 } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 6e6f651..e464dd4 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -2,12 +2,12 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_stream_analytics:enabled,env:%s", var.environment) : "*"}" + filter = "${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}" } } resource "datadog_monitor" "su_utilization" { - name = "[${var.environment}] Streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" + name = "[${var.environment}] Stram Analytics streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}" message = "${var.message}" query = < Date: Fri, 3 Nov 2017 20:28:05 +0100 Subject: [PATCH 17/19] MON-78 add subscription_id and tags --- cloud/azure/stream-analytics/README.md | 9 ++++----- cloud/azure/stream-analytics/inputs.tf | 10 ++++++++-- .../stream-analytics/monitors-stream-analytics.tf | 11 +++++++---- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md index f115e70..28e3e2b 100644 --- a/cloud/azure/stream-analytics/README.md +++ b/cloud/azure/stream-analytics/README.md @@ -19,14 +19,13 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| conversion_errors_threshold_critical | | string | `10` | no | -| conversion_errors_threshold_warning | | string | `0` | no | +| conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no | +| conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | | environment | Architecture environment | string | - | yes | -| function_requests_threshold_critical | | string | `10` | no | -| function_requests_threshold_warning | | string | `0` | no | +| function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | +| function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | | 
message | Message sent when a monitor is triggered | string | - | yes | -| notify_no_data | | string | `false` | no | | runtime_errors_threshold_critical | | string | `10` | no | | runtime_errors_threshold_warning | | string | `0` | no | | su_utilization_threshold_critical | | string | `80` | no | diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 8160547..16807c8 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -26,8 +26,6 @@ variable "service" { } # Global DataDog - - variable "message" { description = "Message sent when a Redis monitor is triggered" } @@ -44,33 +42,41 @@ variable "use_filter_tags" { # Azure Stream Analytics specific variable "su_utilization_threshold_warning" { + description = "Streaming Unit utilization rate limit (warning threshold)" default = 60 } variable "su_utilization_threshold_critical" { + description = "Streaming Unit utilization rate limit (critical threshold)" default = 80 } variable "function_requests_threshold_warning" { + description = "Failed Function Request rate limit (warning threshold)" default = 0 } variable "function_requests_threshold_critical" { + description = "Failed Function Request rate limit (critical threshold)" default = 10 } variable "conversion_errors_threshold_warning" { + description = "Conversion errors limit (warning threshold)" default = 0 } variable "conversion_errors_threshold_critical" { + description = "Conversion errors limit (critical threshold)" default = 10 } variable "runtime_errors_threshold_warning" { + description = "Runtime errors limit (warning threshold)" default = 0 } variable "runtime_errors_threshold_critical" { + description = "Runtime errors limit (critical threshold)" default = 10 } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index e464dd4..0ecb513 100644 --- 
a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -41,8 +41,9 @@ resource "datadog_monitor" "failed_function_requests" { query = < ${var.function_requests_threshold_critical} + avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / + avg:azure.streamanalytics_streamingjobs.aml_callout_requests{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + ) * 100 > ${var.function_requests_threshold_critical} EOF type = "query alert" @@ -66,7 +67,8 @@ resource "datadog_monitor" "failed_function_requests" { resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] Stream Analytics more than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" - message = "${var.message}" + # Hard Coded Message while we don't know how to configure warning and critical thresholds + message = "@FR-CloudPublic-run@fr.clara.net" query = < Date: Fri, 3 Nov 2017 20:51:18 +0100 Subject: [PATCH 18/19] MON-78 update readme --- cloud/azure/stream-analytics/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cloud/azure/stream-analytics/README.md b/cloud/azure/stream-analytics/README.md index 28e3e2b..dca299b 100644 --- a/cloud/azure/stream-analytics/README.md +++ b/cloud/azure/stream-analytics/README.md @@ -26,10 +26,12 @@ Inputs | function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no | | function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no | | message | Message sent when a monitor is triggered | string | - | yes | +| provider | What is the monitored provider | string | azure | no | | runtime_errors_threshold_critical | | string | `10` | no | | runtime_errors_threshold_warning | | string | `0` | no | | su_utilization_threshold_critical | | string 
| `80` | no | | su_utilization_threshold_warning | Monitor specific | string | `60` | no | +| service | What is the monitored service | string | storage | no | | subscription_id | Azure account id used as filter for monitors | string | - | yes | | use_filter_tags | Filter the data with service tags if true | string | `true` | no | From 8afae8b5f44cf60a04a4a6c22e6da5414553a129 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Thu, 23 Nov 2017 15:51:23 +0100 Subject: [PATCH 19/19] MON-78 Normalize monitors & add status monitor --- cloud/azure/stream-analytics/inputs.tf | 32 +++------ .../monitors-stream-analytics.tf | 69 ++++++++++++------- 2 files changed, 53 insertions(+), 48 deletions(-) diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf index 16807c8..ae1186a 100644 --- a/cloud/azure/stream-analytics/inputs.tf +++ b/cloud/azure/stream-analytics/inputs.tf @@ -4,27 +4,6 @@ variable "environment" { type = "string" } -variable "message" { - description = "Message sent when a monitor is triggered" -} - -variable "subscription_id" { - description = "Azure account id used as filter for monitors" - type = "string" -} - -variable "provider" { - description = "Cloud provider which the monitor and its based metric depend on" - type = "string" - default = "azure" -} - -variable "service" { - description = "Service monitored by this set of monitors" - type = "string" - default = "storage" -} - # Global DataDog variable "message" { description = "Message sent when a Redis monitor is triggered" @@ -35,11 +14,16 @@ variable "delay" { default = 600 } -variable "use_filter_tags" { - description = "Filter the data with service tags if true" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" default = "true" } +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + # Azure Stream Analytics specific variable 
"su_utilization_threshold_warning" { description = "Streaming Unit utilization rate limit (warning threshold)" @@ -56,7 +40,7 @@ variable "function_requests_threshold_warning" { default = 0 } -variable "function_requests_threshold_critical" { +variable "failed_function_requests_threshold_critical" { description = "Failed Function Request rate limit (critical threshold)" default = 10 } diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 0ecb513..f72af1f 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -2,12 +2,35 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "subscription_id:${var.subscription_id}"}" + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_streamanalytics:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } +resource "datadog_monitor" "status" { + name = "[${var.environment}] Stream Analytics Status is not ok on {{name}}" + message = "${var.message}" + + query = < ${var.su_utilization_threshold_critical} EOF - type = "query alert" + type = "metric alert" - notify_no_data = "${var.notify_no_data}" + notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -32,22 +55,22 @@ resource "datadog_monitor" "su_utilization" { critical = "${var.su_utilization_threshold_critical}" } - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] } resource "datadog_monitor" "failed_function_requests" { - name = "[${var.environment}] Stream Analytics more than 
${var.function_requests_threshold_critical} failed function requests on {{name}}" + name = "[${var.environment}] Stream Analytics more than ${var.failed_function_requests_threshold_critical} failed function requests on {{name}}" message = "${var.message}" query = < ${var.function_requests_threshold_critical} + ) * 100 > ${var.failed_function_requests_threshold_critical} EOF - type = "query alert" + type = "metric alert" - notify_no_data = "${var.notify_no_data}" + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 60 notify_audit = false @@ -59,27 +82,26 @@ resource "datadog_monitor" "failed_function_requests" { no_data_timeframe = 20 thresholds { warning = "${var.function_requests_threshold_warning}" - critical = "${var.function_requests_threshold_critical}" + critical = "${var.failed_function_requests_threshold_critical}" } - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] } resource "datadog_monitor" "conversion_errors" { name = "[${var.environment}] Stream Analytics more than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}" - # Hard Coded Message while we don't know how to configure warning and critical thresholds - message = "@FR-CloudPublic-run@fr.clara.net" + message = "${var.message}" query = < ${var.conversion_errors_threshold_critical} EOF - type = "query alert" + type = "metric alert" - notify_no_data = "${var.notify_no_data}" + notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -92,24 +114,23 @@ resource "datadog_monitor" "conversion_errors" { critical = "${var.conversion_errors_threshold_critical}" } - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", 
"provider:azure"] } resource "datadog_monitor" "runtime_errors" { name = "[${var.environment}] Stream Analytics more than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}" - # Hard Coded Message while we don't know how to configure warning and critical thresholds - message = "@FR-CloudPublic-run@fr.clara.net" + message = "${var.message}" query = < ${var.runtime_errors_threshold_critical} EOF - type = "query alert" + type = "metric alert" - notify_no_data = "${var.notify_no_data}" + notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -122,5 +143,5 @@ resource "datadog_monitor" "runtime_errors" { critical = "${var.runtime_errors_threshold_critical}" } - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"] }