From 4c474be541eb5f3a3f14bf2a8bd7716803651ecf Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 16:42:58 +0100 Subject: [PATCH] MON-80 Add monitors and update variables --- cloud/azure/iothubs/inputs.tf | 44 +++++++++++----- cloud/azure/iothubs/monitors-iothubs.tf | 69 ++++++++++++++++++++----- 2 files changed, 86 insertions(+), 27 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 5de7dab..38b1b44 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -1,38 +1,54 @@ -variable "critical_escalation_group" {} - -variable "warning_escalation_group" {} - variable "environment" {} variable "stack" {} variable "client_name" {} -## IOT hubs variable "delay" { default = 600 } -variable "warning_jobs_failed" { - default = 5 +## IOT hubs +variable "jobs_failed_threshold_warning" { + default = 0 } -variable "critical_jobs_failed" { +variable "jobs_failed_threshold_critical" { default = 10 } -variable "warning_listjobs_failed" { - default = 5 +variable "jobs_failed_message" {} + +variable "listjobs_failed_threshold_warning" { + default = 0 } -variable "critical_listjobs_failed" { +variable "listjobs_failed_threshold_critical" { default = 10 } -variable "warning_queryjobs_failed" { - default = 5 +variable "listjobs_failed_message" {} + +variable "queryjobs_failed_threshold_warning" { + default = 0 } -variable "critical_queryjobs_failed" { +variable "queryjobs_failed_threshold_critical" { + default = 10 +} + +variable "queryjobs_failed_message" {} + +variable "status_message" {} + +variable "total_devices_message" {} + +variable "c2d_methods_failed_message" {} + +variable "c2d_methods_failed_threshold_warning" { + default = 0 +} + +variable "c2d_methods_failed_threshold_critical" { default = 10 } \ No newline at end of file diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index e333808..12f3d9a 100644 --- 
a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,13 +1,13 @@ resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] Too many jobs failed on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.jobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.critical_jobs_failed}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" type = "query alert" thresholds { - warning = "${var.warning_jobs_failed}" - critical = "${var.critical_jobs_failed}" + warning = "${var.jobs_failed_threshold_warning}" + critical = "${var.jobs_failed_threshold_critical}" } notify_no_data = false @@ -24,14 +24,14 @@ resource "datadog_monitor" "too_many_jobs_failed" { resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] Too many list_jobs failure on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.listjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > 
${var.critical_listjobs_failed}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" type = "query alert" thresholds { - warning = "${var.warning_listjobs_failed}" - critical = "${var.critical_listjobs_failed}" + warning = "${var.listjobs_failed_threshold_warning}" + critical = "${var.listjobs_failed_threshold_critical}" } notify_no_data = false @@ -48,14 +48,14 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] Too many query_jobs failed on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.queryjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" type = "query alert" thresholds { - warning = "${var.warning_queryjobs_failed}" - critical = "${var.critical_queryjobs_failed}" + warning = "${var.queryjobs_failed_threshold_warning}" + critical = "${var.queryjobs_failed_threshold_critical}" }
notify_no_data = false @@ -72,11 +72,54 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { resource "datadog_monitor" "status" { name = "[${var.environment}] Status is not ok on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.status_message}" query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1" type = "query alert" + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "total_devices" { + name = "[${var.environment}] Total devices is wrong on {{name}} " + message = "${var.total_devices_message}" + + query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{*} by {name,resource_group} == 0" + type = "query alert" + + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_c2d_methods_failed" { + name = "[${var.environment}] Too many c2d methods failure on {{name}} " + message = "${var.c2d_methods_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.c2d_methods_failed_threshold_warning}" + critical = "${var.c2d_methods_failed_threshold_critical}" + } + 
notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 60