From e4e929ec1d6bb380da5eed54bd324ab33fc513ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9my=20NANCEL?=
Date: Mon, 30 Oct 2017 12:47:19 +0100
Subject: [PATCH] MON-78 Add datadog monitor for stream analytics

---
 cloud/azure/stream-analytics/inputs.tf        |  53 +++++++++
 .../monitors-stream-analytics.tf              | 104 ++++++++++++++++++
 2 files changed, 157 insertions(+)
 create mode 100644 cloud/azure/stream-analytics/inputs.tf
 create mode 100644 cloud/azure/stream-analytics/monitors-stream-analytics.tf

diff --git a/cloud/azure/stream-analytics/inputs.tf b/cloud/azure/stream-analytics/inputs.tf
new file mode 100644
index 0000000..e9bc507
--- /dev/null
+++ b/cloud/azure/stream-analytics/inputs.tf
@@ -0,0 +1,53 @@
+# hno_escalation_group is interpolated into every monitor message.
+# ho_escalation_group is declared but not referenced by these monitors.
+variable "hno_escalation_group" {}
+variable "ho_escalation_group" {}
+
+# Interpolated into every monitor name.
+variable "environment" {}
+
+# Passed to notify_no_data on every monitor.
+variable "notify_no_data" {
+  default = "false"
+}
+
+# Passed to both evaluation_delay and new_host_delay on every monitor.
+variable "delay" {
+  default = "600"
+}
+
+# Warning/critical pairs; each critical value is also interpolated into the
+# corresponding monitor query and name.
+variable "su_utilization_warning" {
+  default = 60
+}
+
+variable "su_utilization_critical" {
+  default = 80
+}
+
+variable "failed_function_requests_warning" {
+  default = 0
+}
+
+variable "failed_function_requests_critical" {
+  default = 10
+}
+
+variable "conversion_errors_warning" {
+  default = 0
+}
+
+variable "conversion_errors_critical" {
+  default = 10
+}
+
+variable "runtime_errors_warning" {
+  default = 0
+}
+
+# Must stay above runtime_errors_warning: equal warning and critical
+# thresholds leave no distinct warning state.
+variable "runtime_errors_critical" {
+  default = 10
+}
diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf
new file mode 100644
index 0000000..f18d7f1
--- /dev/null
+++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf
@@ -0,0 +1,104 @@
+# Stream Analytics monitors. Every query below is grouped by name and
+# resource_group, so titles use {{name.name}} (the `name` tag value of the
+# triggering group); host variables are not populated for these groups.
+
+# Triggers when the 5-minute average of resource_utilization, per name and
+# resource_group, exceeds su_utilization_critical.
+resource "datadog_monitor" "SU_utilization" {
+  name    = "[${var.environment} SU utilization at more than ${var.su_utilization_critical}% on {{name.name}}]"
+  message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"
+
+  query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.resource_utilization{*} by {name,resource_group} > ${var.su_utilization_critical}"
+  type  = "query alert"
+
+  notify_no_data      = "${var.notify_no_data}"
+  evaluation_delay    = "${var.delay}"
+  renotify_interval   = 60
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = "${var.delay}"
+  no_data_timeframe   = 20
+  thresholds {
+    warning  = "${var.su_utilization_warning}"
+    critical = "${var.su_utilization_critical}"
+  }
+}
+
+# Triggers when the 5-minute average of aml_callout_failed_requests, per name
+# and resource_group, exceeds failed_function_requests_critical.
+resource "datadog_monitor" "failed_function_requests" {
+  name    = "[${var.environment} More than ${var.failed_function_requests_critical} failed function requests on {{name.name}}]"
+  message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"
+
+  query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests{*} by {name,resource_group} > ${var.failed_function_requests_critical}"
+  type  = "query alert"
+
+  notify_no_data      = "${var.notify_no_data}"
+  evaluation_delay    = "${var.delay}"
+  renotify_interval   = 60
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = "${var.delay}"
+  no_data_timeframe   = 20
+  thresholds {
+    warning  = "${var.failed_function_requests_warning}"
+    critical = "${var.failed_function_requests_critical}"
+  }
+}
+
+# Triggers when the 5-minute average of conversion_errors, per name and
+# resource_group, exceeds conversion_errors_critical.
+resource "datadog_monitor" "conversion_errors" {
+  name    = "[${var.environment} More than ${var.conversion_errors_critical} conversion errors on {{name.name}}]"
+  message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"
+
+  query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.conversion_errors{*} by {name,resource_group} > ${var.conversion_errors_critical}"
+  type  = "query alert"
+
+  notify_no_data      = "${var.notify_no_data}"
+  evaluation_delay    = "${var.delay}"
+  renotify_interval   = 60
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = "${var.delay}"
+  no_data_timeframe   = 20
+  thresholds {
+    warning  = "${var.conversion_errors_warning}"
+    critical = "${var.conversion_errors_critical}"
+  }
+}
+
+# Triggers when the 5-minute average of the errors metric, per name and
+# resource_group, exceeds runtime_errors_critical.
+resource "datadog_monitor" "runtime_errors" {
+  name    = "[${var.environment} More than ${var.runtime_errors_critical} runtime errors on {{name.name}}]"
+  message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"
+
+  query = "avg(last_5m):avg:azure.streamanalytics_streamingjobs.errors{*} by {name,resource_group} > ${var.runtime_errors_critical}"
+  type  = "query alert"
+
+  notify_no_data      = "${var.notify_no_data}"
+  evaluation_delay    = "${var.delay}"
+  renotify_interval   = 60
+  notify_audit        = false
+  timeout_h           = 0
+  include_tags        = true
+  locked              = false
+  require_full_window = true
+  new_host_delay      = "${var.delay}"
+  no_data_timeframe   = 20
+  thresholds {
+    warning  = "${var.runtime_errors_warning}"
+    critical = "${var.runtime_errors_critical}"
+  }
+}
+