From daabb7244af225ccffc2580fbb2b441586163bba Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 14:30:05 +0100 Subject: [PATCH 01/14] MON-80 Add inputs and monitors files --- cloud/azure/iothubs/inputs.tf | 36 ++++++++++ cloud/azure/iothubs/monitors-iothubs.tf | 90 +++++++++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 cloud/azure/iothubs/inputs.tf create mode 100644 cloud/azure/iothubs/monitors-iothubs.tf diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf new file mode 100644 index 0000000..ddc3456 --- /dev/null +++ b/cloud/azure/iothubs/inputs.tf @@ -0,0 +1,36 @@ +variable "hno_escalation_group" {} + +variable "ho_escalation_group" {} + +variable "environment" {} + +variable "subscription_id" {} + +## IOT hubs +variable "delay" { + default = 600 +} + +variable "warning_jobs_failed" { + default = 5 +} + +variable "critical_jobs_failed" { + default = 10 +} + +variable "warning_listjobs_failed" { + default = 5 +} + +variable "critical_listjobs_failed" { + default = 10 +} + +variable "warning_queryjobs_failed" { + default = 5 +} + +variable "critical_queryjobs_failed" { + default = 10 +} \ No newline at end of file diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf new file mode 100644 index 0000000..5f584db --- /dev/null +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -0,0 +1,90 @@ +resource "datadog_monitor" "too_many_jobs_failed" { + name = "[${var.environment}] Too many jobs failed on {{name}} " + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.critical_jobs_failed}" + type = 
"query alert" + + thresholds { + warning = "${var.warning_jobs_failed}" + critical = "${var.critical_jobs_failed}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_list_jobs_failed" { + name = "[${var.environment}] Too many list_jobs failure on {{name}} " + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_listjobs_failed}" + type = "query alert" + + thresholds { + warning = "${var.warning_listjobs_failed}" + critical = "${var.critical_listjobs_failed}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_query_jobs_failed" { + name = "[${var.environment}] Too many query_jobs failed on {{name}} " + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > 
${var.critical_queryjobs_failed}" + type = "query alert" + + thresholds { + warning = "${var.warning_queryjobs_failed}" + critical = "${var.critical_queryjobs_failed}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "status" { + name = "[${var.environment}] Status is not ok on {{name}} " + message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + + query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1" + type = "query alert" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} \ No newline at end of file From 7f0a0e91cf6fdd3cb6ea5d33abcbf2dbdb41c0a2 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 15:21:40 +0100 Subject: [PATCH 02/14] MON-80 Rename variable for message alerting --- cloud/azure/iothubs/inputs.tf | 8 +++++--- cloud/azure/iothubs/monitors-iothubs.tf | 8 ++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index ddc3456..5de7dab 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -1,10 +1,12 @@ -variable "hno_escalation_group" {} +variable "critical_escalation_group" {} -variable "ho_escalation_group" {} +variable "warning_escalation_group" {} variable "environment" {} -variable "subscription_id" {} +variable "stack" {} + +variable "client_name" {} ## IOT hubs variable "delay" { diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf 
index 5f584db..e333808 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,6 +1,6 @@ resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] Too many jobs failed on {{name}} " - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.critical_jobs_failed}" type = "query alert" @@ -24,7 +24,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] Too many list_jobs failure on {{name}} " - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_listjobs_failed}" type = "query alert" @@ -48,7 +48,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] Too many query_jobs failed on {{name}} " - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} 
\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_queryjobs_failed}" type = "query alert" @@ -72,7 +72,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { resource "datadog_monitor" "status" { name = "[${var.environment}] Status is not ok on {{name}} " - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" + message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1" type = "query alert" From 4c474be541eb5f3a3f14bf2a8bd7716803651ecf Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 16:42:58 +0100 Subject: [PATCH 03/14] MON-80 Add monitors and update variables --- cloud/azure/iothubs/inputs.tf | 44 +++++++++++----- cloud/azure/iothubs/monitors-iothubs.tf | 69 ++++++++++++++++++++----- 2 files changed, 86 insertions(+), 27 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 5de7dab..38b1b44 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -1,38 +1,54 @@ -variable "critical_escalation_group" {} - -variable "warning_escalation_group" {} - variable "environment" {} variable "stack" {} variable "client_name" {} -## IOT hubs variable "delay" { default = 600 } -variable "warning_jobs_failed" { - default = 5 +## IOT hubs +variable 
"jobs_failed_threshold_warning" { + default = 0 } -variable "critical_jobs_failed" { +variable "jobs_failed_threshold_critical" { default = 10 } -variable "warning_listjobs_failed" { - default = 5 +variable "jobs_failed_message" {} + +variable "listjobs_failed_threshold_warning" { + default = 0 } -variable "critical_listjobs_failed" { +variable "listjobs_failed_threshold_critical" { default = 10 } -variable "warning_queryjobs_failed" { - default = 5 +variable "listjobs_failed_message" {} + +variable "queryjobs_failed_threshold_warning" { + default = 0 } -variable "critical_queryjobs_failed" { +variable "queryjobs_failed_threshold_critical" { + default = 10 +} + +variable "queryjobs_failed_message" {} + +variable "status_message" {} + +variable "total_devices_message" {} + +variable "c2d_methods_failed_message" {} + +variable "c2d_methods_failed_threshold_warning" { + default = 0 +} + +variable "c2d_methods_failed_threshold_critical" { default = 10 } \ No newline at end of file diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index e333808..12f3d9a 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,13 +1,13 @@ resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] Too many jobs failed on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.jobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.critical_jobs_failed}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by 
{name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" type = "query alert" thresholds { - warning = "${var.warning_jobs_failed}" - critical = "${var.critical_jobs_failed}" + warning = "${var.jobs_failed_threshold_warning}" + critical = "${var.jobs_failed_threshold_critical}" } notify_no_data = false @@ -24,14 +24,14 @@ resource "datadog_monitor" "too_many_jobs_failed" { resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] Too many list_jobs failure on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.listjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_listjobs_failed}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" type = "query alert" thresholds { - warning = "${var.warning_listjobs_failed}" - critical = "${var.critical_listjobs_failed}" + warning = "${var.listjobs_failed_threshold_warning}" + critical = "${var.listjobs_failed_threshold_critical}" } notify_no_data = false @@ -48,14 +48,14 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] Too many query_jobs failed on {{name}} " - message = 
"{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.queryjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.critical_queryjobs_failed}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" type = "query alert" thresholds { - warning = "${var.warning_queryjobs_failed}" - critical = "${var.critical_queryjobs_failed}" + warning = "${var.queryjobs_failed_threshold_warning}" + critical = "${var.queryjobs_failed_threshold_critical}" } notify_no_data = false @@ -72,11 +72,54 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { resource "datadog_monitor" "status" { name = "[${var.environment}] Status is not ok on {{name}} " - message = "{{#is_alert}}\n${var.critical_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.critical_escalation_group}\n{{/is_recovery}}" + message = "${var.status_message}" query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1" type = "query alert" + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "total_devices" { + name = "[${var.environment}] Total devices is wrong on {{name}} " + message
= "${var.total_devices_message}" + + query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{*} by {name,resource_group} == 0" + type = "query alert" + + notify_no_data = true + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_c2d_methods_failed" { + name = "[${var.environment}] Too many c2d methods failure on {{name}} " + message = "${var.c2d_methods_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.c2d_methods_failed_threshold_warning}" + critical = "${var.c2d_methods_failed_threshold_critical}" + } + notify_no_data = false evaluation_delay = "${var.delay}" renotify_interval = 60 From effaaf0e12d6446510700e4bc71765c8a0b37441 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 17:13:42 +0100 Subject: [PATCH 04/14] MON-80 Add c2d and d2c monitors --- cloud/azure/iothubs/inputs.tf | 46 +++++++++++- cloud/azure/iothubs/monitors-iothubs.tf | 98 ++++++++++++++++++++++++- 2 files changed, 140 insertions(+), 4 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 38b1b44..093b3a3 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -43,12 +43,52 @@ variable "status_message" {} variable "total_devices_message" {} -variable "c2d_methods_failed_message" {} - variable "c2d_methods_failed_threshold_warning" { default = 0 } variable "c2d_methods_failed_threshold_critical" { default = 10 -} \ No 
newline at end of file +} + +variable "c2d_methods_failed_message" {} + +variable "c2d_twin_read_failed_threshold_warning" { + default = 0 +} + +variable "c2d_twin_read_failed_threshold_critical" { + default = 10 +} + +variable "c2d_twin_read_failed_message" {} + +variable "c2d_twin_update_failed_threshold_warning" { + default = 0 +} + +variable "c2d_twin_update_failed_threshold_critical" { + default = 10 +} + +variable "c2d_twin_update_failed_message" {} + +variable "d2c_twin_read_failed_threshold_warning" { + default = 0 +} + +variable "d2c_twin_read_failed_threshold_critical" { + default = 10 +} + +variable "d2c_twin_read_failed_message" {} + +variable "d2c_twin_update_failed_threshold_warning" { + default = 0 +} + +variable "d2c_twin_update_failed_threshold_critical" { + default = 10 +} + +variable "d2c_twin_update_failed_message" {} \ No newline at end of file diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 12f3d9a..8d44dde 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -130,4 +130,100 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 -} \ No newline at end of file +} + +resource "datadog_monitor" "too_many_c2d_twin_read_failed" { + name = "[${var.environment}] Too many c2d twin read failure on {{name}} " + message = "${var.c2d_twin_read_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.c2d_twin_read_failed_threshold_warning}" + critical = "${var.c2d_twin_read_failed_threshold_critical}" 
+ } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_c2d_twin_update_failed" { + name = "[${var.environment}] Too many c2d twin update failure on {{name}} " + message = "${var.c2d_twin_update_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.c2d_twin_update_failed_threshold_warning}" + critical = "${var.c2d_twin_update_failed_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_twin_read_failed" { + name = "[${var.environment}] Too many d2c twin read failure on {{name}} " + message = "${var.d2c_twin_read_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_twin_read_failed_threshold_warning}" + critical = "${var.d2c_twin_read_failed_threshold_critical}" + } + + notify_no_data = false + 
evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_twin_update_failed" { + name = "[${var.environment}] Too many d2c twin update failure on {{name}} " + message = "${var.d2c_twin_update_failed_message}" + + query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_twin_update_failed_threshold_warning}" + critical = "${var.d2c_twin_update_failed_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 60 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} From 5136dd5c4d1e5b3bea7685b95f8361d21a02dc34 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 17:30:16 +0100 Subject: [PATCH 05/14] MON-80 Add subscription_id --- cloud/azure/iothubs/inputs.tf | 2 ++ cloud/azure/iothubs/monitors-iothubs.tf | 40 ++++++++++++------------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 093b3a3..e705d8f 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -4,6 +4,8 @@ variable "stack" {} variable "client_name" {} +variable "subscription_id" {} + variable "delay" { default = 600 } diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 8d44dde..a4ec018 100644 --- 
a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -2,7 +2,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] Too many jobs failed on {{name}} " message = "${var.jobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -12,7 +12,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -26,7 +26,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] Too many list_jobs failure on {{name}} " message = "${var.listjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( 
avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -36,7 +36,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -50,7 +50,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] Too many query_jobs failed on {{name}} " message = "${var.queryjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{*} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{*} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" + query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -60,7 +60,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -74,12 +74,12 @@ resource "datadog_monitor" "status" { name = "[${var.environment}] Status is not ok on {{name}} " message = 
"${var.status_message}" - query = "avg(last_5m):avg:azure.devices_iothubs.status{*} by {name,resource_group} < 1" + query = "avg(last_5m):avg:azure.devices_iothubs.status{subscription_id:${var.subscription_id}} by {name,resource_group} < 1" type = "query alert" notify_no_data = true evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -93,12 +93,12 @@ resource "datadog_monitor" "total_devices" { name = "[${var.environment}] Total devices is wrong on {{name}} " message = "${var.total_devices_message}" - query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{*} by {name,resource_group} == 0" + query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{subscription_id:${var.subscription_id}} by {name,resource_group} == 0" type = "query alert" notify_no_data = true evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -112,7 +112,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { name = "[${var.environment}] Too many c2d methods failure on {{name}} " message = "${var.c2d_methods_failed_message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > 
${var.c2d_methods_failed_threshold_critical}" type = "query alert" thresholds { @@ -122,7 +122,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -136,7 +136,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { name = "[${var.environment}] Too many c2d twin read failure on {{name}} " message = "${var.c2d_twin_read_failed_message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" type = "query alert" thresholds { @@ -146,7 +146,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -160,7 +160,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { name = "[${var.environment}] Too many c2d twin update failure on {{name}} " message = "${var.c2d_twin_update_failed_message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{*} by 
{name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" type = "query alert" thresholds { @@ -170,7 +170,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -184,7 +184,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { name = "[${var.environment}] Too many d2c twin read failure on {{name}} " message = "${var.d2c_twin_read_failed_message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" type = "query alert" thresholds { @@ -194,7 
+194,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true @@ -208,7 +208,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { name = "[${var.environment}] Too many d2c twin update failure on {{name}} " message = "${var.d2c_twin_update_failed_message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{*} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{*} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{*} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" type = "query alert" thresholds { @@ -218,7 +218,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { notify_no_data = false evaluation_delay = "${var.delay}" - renotify_interval = 60 + renotify_interval = 0 notify_audit = false timeout_h = 0 include_tags = true From 193352c212277fac91d51ad336b704f7cde8d54c Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 30 Oct 2017 18:09:03 +0100 Subject: [PATCH 06/14] MON-80 Add IOT Hub in Names --- cloud/azure/iothubs/monitors-iothubs.tf | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index a4ec018..f111897 100644 --- 
a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,5 +1,5 @@ resource "datadog_monitor" "too_many_jobs_failed" { - name = "[${var.environment}] Too many jobs failed on {{name}} " + name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" message = "${var.jobs_failed_message}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" @@ -23,7 +23,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { } resource "datadog_monitor" "too_many_list_jobs_failed" { - name = "[${var.environment}] Too many list_jobs failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}" message = "${var.listjobs_failed_message}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" @@ -47,7 +47,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { } resource "datadog_monitor" "too_many_query_jobs_failed" { - name = "[${var.environment}] Too many query_jobs failed on {{name}} " + name = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}" message = "${var.queryjobs_failed_message}" query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by 
{resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" @@ -55,7 +55,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { thresholds { warning = "${var.queryjobs_failed_threshold_warning}" - critical = "${var.queryjobs_failed_threshold_warning}" + critical = "${var.queryjobs_failed_threshold_critical}" } notify_no_data = false @@ -71,7 +71,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { } resource "datadog_monitor" "status" { - name = "[${var.environment}] Status is not ok on {{name}} " + name = "[${var.environment}] IOT Hub Status is not ok on {{name}}" message = "${var.status_message}" query = "avg(last_5m):avg:azure.devices_iothubs.status{subscription_id:${var.subscription_id}} by {name,resource_group} < 1" @@ -90,7 +90,7 @@ resource "datadog_monitor" "status" { } resource "datadog_monitor" "total_devices" { - name = "[${var.environment}] Total devices is wrong on {{name}} " + name = "[${var.environment}] IOT Hub Total devices is wrong on {{name}}" message = "${var.total_devices_message}" query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{subscription_id:${var.subscription_id}} by {name,resource_group} == 0" @@ -109,7 +109,7 @@ resource "datadog_monitor" "total_devices" { } resource "datadog_monitor" "too_many_c2d_methods_failed" { - name = "[${var.environment}] Too many c2d methods failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many c2d methods failure on {{name}}" message = "${var.c2d_methods_failed_message}" query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( 
avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}" @@ -133,7 +133,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { } resource "datadog_monitor" "too_many_c2d_twin_read_failed" { - name = "[${var.environment}] Too many c2d twin read failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many c2d twin read failure on {{name}}" message = "${var.c2d_twin_read_failed_message}" query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" @@ -157,7 +157,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { } resource "datadog_monitor" "too_many_c2d_twin_update_failed" { - name = "[${var.environment}] Too many c2d twin update failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many c2d twin update failure on {{name}}" message = "${var.c2d_twin_update_failed_message}" query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" @@ -181,7 +181,7 @@ resource "datadog_monitor" 
"too_many_c2d_twin_update_failed" { } resource "datadog_monitor" "too_many_d2c_twin_read_failed" { - name = "[${var.environment}] Too many d2c twin read failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many d2c twin read failure on {{name}}" message = "${var.d2c_twin_read_failed_message}" query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" @@ -205,7 +205,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { } resource "datadog_monitor" "too_many_d2c_twin_update_failed" { - name = "[${var.environment}] Too many d2c twin update failure on {{name}} " + name = "[${var.environment}] IOT Hub Too many d2c twin update failure on {{name}}" message = "${var.d2c_twin_update_failed_message}" query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" From 113d4aabd25fa1172dea821ef6b6f7688011f960 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 11:12:26 +0100 Subject: [PATCH 07/14] MON-80 Add monitors for telemetry --- cloud/azure/iothubs/inputs.tf | 44 ++++++++- cloud/azure/iothubs/monitors-iothubs.tf | 121 +++++++++++++++++++++++- 2 files changed, 161 insertions(+), 4 deletions(-) diff --git 
a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index e705d8f..5ae0587 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -93,4 +93,46 @@ variable "d2c_twin_update_failed_threshold_critical" { default = 10 } -variable "d2c_twin_update_failed_message" {} \ No newline at end of file +variable "d2c_twin_update_failed_message" {} + +variable "d2c_telemetry_egress_dropped_threshold_warning" { + default = 500 +} + +variable "d2c_telemetry_egress_dropped_threshold_critical" { + default = 1000 +} + +variable "d2c_telemetry_egress_dropped_message" {} + +variable "d2c_telemetry_egress_orphaned_threshold_warning" { + default = 500 +} + +variable "d2c_telemetry_egress_orphaned_threshold_critical" { + default = 1000 +} + +variable "d2c_telemetry_egress_orphaned_message" {} + +variable "d2c_telemetry_egress_invalid_threshold_warning" { + default = 500 +} + +variable "d2c_telemetry_egress_invalid_threshold_critical" { + default = 1000 +} + +variable "d2c_telemetry_egress_invalid_message" {} + +variable "d2c_telemetry_egress_fallback_threshold_warning" { + default = 500 +} + +variable "d2c_telemetry_egress_fallback_threshold_critical" { + default = 1000 +} + +variable "d2c_telemetry_egress_fallback_message" {} + +variable "d2c_telemetry_ingress_nosent_message" {} diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index f111897..4c59099 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -2,7 +2,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" message = "${var.jobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + 
avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -26,7 +26,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}" message = "${var.listjobs_failed_message}" - query = "sum(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -50,7 +50,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}" message = "${var.queryjobs_failed_message}" - query = 
"sum(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" + query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" type = "query alert" thresholds { @@ -227,3 +227,118 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 } + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped on {{name}}" + message = "${var.d2c_telemetry_egress_dropped_message}" + + query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_dropped_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_telemetry_egress_dropped_threshold_warning}" + critical = "${var.d2c_telemetry_egress_dropped_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" 
"too_many_d2c_telemetry_egress_orphaned" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned on {{name}}" + message = "${var.d2c_telemetry_egress_orphaned_message}" + + query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_orphaned_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_telemetry_egress_orphaned_threshold_warning}" + critical = "${var.d2c_telemetry_egress_orphaned_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid on {{name}}" + message = "${var.d2c_telemetry_egress_invalid_message}" + + query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_invalid_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_telemetry_egress_invalid_threshold_warning}" + critical = "${var.d2c_telemetry_egress_invalid_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress fallback on {{name}}" + message = "${var.d2c_telemetry_egress_fallback_message}" + + query = 
"sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.fallback{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_fallback_threshold_critical}" + type = "query alert" + + thresholds { + warning = "${var.d2c_telemetry_egress_fallback_threshold_warning}" + critical = "${var.d2c_telemetry_egress_fallback_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} + +resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress no sent on {{name}}" + message = "${var.d2c_telemetry_ingress_nosent_message}" + + query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() - avg:azure.devices_iothubs.d2c.telemetry.ingress.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() > 0" + type = "query alert" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 +} From cf3309ce753a146901fce7d1bcb4871639f8d410 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 11:47:37 +0100 Subject: [PATCH 08/14] MON-80 Add README.md --- cloud/azure/iothubs/README.md | 109 ++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 cloud/azure/iothubs/README.md diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md new file mode 100644 index 0000000..d53bf2b --- /dev/null +++ b/cloud/azure/iothubs/README.md @@ -0,0 +1,109 @@ +Azure Redis DataDog monitors 
+============================ + +How to use this module +---------------------- + +``` +module "iothubs" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors" + + jobs_failed_message = "${module.datadog-message-alerting.alerting-message}" + listjobs_failed_message = "${module.datadog-message-alerting.alerting-message}" + queryjobs_failed_message = "${module.datadog-message-alerting.alerting-message}" + status_message = "${module.datadog-message-alerting.alerting-message}" + total_devices_message = "${module.datadog-message-alerting.alerting-message}" + c2d_methods_failed_message = "${module.datadog-message-alerting.alerting-message}" + c2d_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}" + c2d_twin_update_failed_message = "${module.datadog-message-alerting.alerting-message}" + d2c_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}" + d2c_twin_update_failed_message = "${module.datadog-message-alerting.alerting-message}" + d2c_telemetry_egress_dropped_message = "${module.datadog-message-alerting.alerting-message}" + d2c_telemetry_egress_orphaned_message = "${module.datadog-message-alerting.alerting-message}" + d2c_telemetry_egress_invalid_message = "${module.datadog-message-alerting.alerting-message}" + d2c_telemetry_egress_fallback_message = "${module.datadog-message-alerting.alerting-message}" + d2c_telemetry_ingress_nosent_message = "${module.datadog-message-alerting.alerting-message}" + + environment = "${var.environment}" + stack = "${var.stack}" + client_name = "${var.client_name}" + subscription_id = "${var.subscription_id}" +} +``` + +Purpose +------- +Creates a DataDog monitors with the following checks : + +* Service status check +* Jobs failed average check +* Query Jobs failed average check +* List Jobs failed average check +* Total devices count check +* C2D methods failed average check +* C2D twin read failed 
average check +* C2D twin update failed average check +* D2C twin read failed average check +* D2C twin update failed average check +* D2C telemetry egress dropped count check +* D2C telemetry egress orphaned count check +* D2C telemetry egress invalid count check +* D2C telemetry egress fallback count check +* D2C telemetry ingress no sent count check + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| c2d_methods_failed_message | | string | - | yes | +| c2d_methods_failed_threshold_critical | | string | `10` | no | +| c2d_methods_failed_threshold_warning | | string | `0` | no | +| c2d_twin_read_failed_message | | string | - | yes | +| c2d_twin_read_failed_threshold_critical | | string | `10` | no | +| c2d_twin_read_failed_threshold_warning | | string | `0` | no | +| c2d_twin_update_failed_message | | string | - | yes | +| c2d_twin_update_failed_threshold_critical | | string | `10` | no | +| c2d_twin_update_failed_threshold_warning | | string | `0` | no | +| client_name | | string | - | yes | +| d2c_telemetry_egress_dropped_message | | string | - | yes | +| d2c_telemetry_egress_dropped_threshold_critical | | string | `1000` | no | +| d2c_telemetry_egress_dropped_threshold_warning | | string | `500` | no | +| d2c_telemetry_egress_fallback_message | | string | - | yes | +| d2c_telemetry_egress_fallback_threshold_critical | | string | `1000` | no | +| d2c_telemetry_egress_fallback_threshold_warning | | string | `500` | no | +| d2c_telemetry_egress_invalid_message | | string | - | yes | +| d2c_telemetry_egress_invalid_threshold_critical | | string | `1000` | no | +| d2c_telemetry_egress_invalid_threshold_warning | | string | `500` | no | +| d2c_telemetry_egress_orphaned_message | | string | - | yes | +| d2c_telemetry_egress_orphaned_threshold_critical | | string | `1000` | no | +| d2c_telemetry_egress_orphaned_threshold_warning | | string | `500` | no | +| d2c_telemetry_ingress_nosent_message | | 
string | - | yes | +| d2c_twin_read_failed_message | | string | - | yes | +| d2c_twin_read_failed_threshold_critical | | string | `10` | no | +| d2c_twin_read_failed_threshold_warning | | string | `0` | no | +| d2c_twin_update_failed_message | | string | - | yes | +| d2c_twin_update_failed_threshold_critical | | string | `10` | no | +| d2c_twin_update_failed_threshold_warning | | string | `0` | no | +| delay | | string | `600` | no | +| environment | | string | - | yes | +| jobs_failed_message | | string | - | yes | +| jobs_failed_threshold_critical | | string | `10` | no | +| jobs_failed_threshold_warning | # IOT hubs | string | `0` | no | +| listjobs_failed_message | | string | - | yes | +| listjobs_failed_threshold_critical | | string | `10` | no | +| listjobs_failed_threshold_warning | | string | `0` | no | +| queryjobs_failed_message | | string | - | yes | +| queryjobs_failed_threshold_critical | | string | `10` | no | +| queryjobs_failed_threshold_warning | | string | `0` | no | +| stack | | string | - | yes | +| status_message | | string | - | yes | +| subscription_id | | string | - | yes | +| total_devices_message | | string | - | yes | + +Related documentation +--------------------- + +DataDog documentation: https://docs.datadoghq.com/integrations/azure_iot_hub/ + +Azure IOT Hubs metrics documentation: https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health \ No newline at end of file From c1563c331898b4ca8b2b08792e27d35e94affed2 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 14:25:24 +0100 Subject: [PATCH 09/14] MON-80 use only one message and add inputs descriptions --- cloud/azure/iothubs/README.md | 97 ++++++----------- cloud/azure/iothubs/inputs.tf | 76 +++++++------- cloud/azure/iothubs/monitors-iothubs.tf | 133 ++++++++++++++++++------ 3 files changed, 178 insertions(+), 128 deletions(-) diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index d53bf2b..3d6bb91 100644 --- 
a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -1,4 +1,4 @@ -Azure Redis DataDog monitors +Azure IOT Hubs DataDog monitors ============================ How to use this module @@ -8,22 +8,8 @@ How to use this module module "iothubs" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors" - jobs_failed_message = "${module.datadog-message-alerting.alerting-message}" - listjobs_failed_message = "${module.datadog-message-alerting.alerting-message}" - queryjobs_failed_message = "${module.datadog-message-alerting.alerting-message}" - status_message = "${module.datadog-message-alerting.alerting-message}" - total_devices_message = "${module.datadog-message-alerting.alerting-message}" - c2d_methods_failed_message = "${module.datadog-message-alerting.alerting-message}" - c2d_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}" - c2d_twin_update_failed_message = "${module.datadog-message-alerting.alerting-message}" - d2c_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}" - d2c_twin_update_failed_message = "${module.datadog-message-alerting.alerting-message}" - d2c_telemetry_egress_dropped_message = "${module.datadog-message-alerting.alerting-message}" - d2c_telemetry_egress_orphaned_message = "${module.datadog-message-alerting.alerting-message}" - d2c_telemetry_egress_invalid_message = "${module.datadog-message-alerting.alerting-message}" - d2c_telemetry_egress_fallback_message = "${module.datadog-message-alerting.alerting-message}" - d2c_telemetry_ingress_nosent_message = "${module.datadog-message-alerting.alerting-message}" - + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" stack = "${var.stack}" client_name = "${var.client_name}" @@ -56,54 +42,39 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| 
c2d_methods_failed_message | | string | - | yes | -| c2d_methods_failed_threshold_critical | | string | `10` | no | -| c2d_methods_failed_threshold_warning | | string | `0` | no | -| c2d_twin_read_failed_message | | string | - | yes | -| c2d_twin_read_failed_threshold_critical | | string | `10` | no | -| c2d_twin_read_failed_threshold_warning | | string | `0` | no | -| c2d_twin_update_failed_message | | string | - | yes | -| c2d_twin_update_failed_threshold_critical | | string | `10` | no | -| c2d_twin_update_failed_threshold_warning | | string | `0` | no | -| client_name | | string | - | yes | -| d2c_telemetry_egress_dropped_message | | string | - | yes | -| d2c_telemetry_egress_dropped_threshold_critical | | string | `1000` | no | -| d2c_telemetry_egress_dropped_threshold_warning | | string | `500` | no | -| d2c_telemetry_egress_fallback_message | | string | - | yes | -| d2c_telemetry_egress_fallback_threshold_critical | | string | `1000` | no | -| d2c_telemetry_egress_fallback_threshold_warning | | string | `500` | no | -| d2c_telemetry_egress_invalid_message | | string | - | yes | -| d2c_telemetry_egress_invalid_threshold_critical | | string | `1000` | no | -| d2c_telemetry_egress_invalid_threshold_warning | | string | `500` | no | -| d2c_telemetry_egress_orphaned_message | | string | - | yes | -| d2c_telemetry_egress_orphaned_threshold_critical | | string | `1000` | no | -| d2c_telemetry_egress_orphaned_threshold_warning | | string | `500` | no | -| d2c_telemetry_ingress_nosent_message | | string | - | yes | -| d2c_twin_read_failed_message | | string | - | yes | -| d2c_twin_read_failed_threshold_critical | | string | `10` | no | -| d2c_twin_read_failed_threshold_warning | | string | `0` | no | -| d2c_twin_update_failed_message | | string | - | yes | -| d2c_twin_update_failed_threshold_critical | | string | `10` | no | -| d2c_twin_update_failed_threshold_warning | | string | `0` | no | -| delay | | string | `600` | no | -| environment | | string | - | yes | -| 
jobs_failed_message | | string | - | yes | -| jobs_failed_threshold_critical | | string | `10` | no | -| jobs_failed_threshold_warning | # IOT hubs | string | `0` | no | -| listjobs_failed_message | | string | - | yes | -| listjobs_failed_threshold_critical | | string | `10` | no | -| listjobs_failed_threshold_warning | | string | `0` | no | -| queryjobs_failed_message | | string | - | yes | -| queryjobs_failed_threshold_critical | | string | `10` | no | -| queryjobs_failed_threshold_warning | | string | `0` | no | -| stack | | string | - | yes | -| status_message | | string | - | yes | -| subscription_id | | string | - | yes | -| total_devices_message | | string | - | yes | +| c2d_methods_failed_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | +| c2d_methods_failed_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | +| c2d_twin_read_failed_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| c2d_twin_read_failed_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| c2d_twin_update_failed_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| c2d_twin_update_failed_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| client_name | Client Name | string | - | yes | +| d2c_telemetry_egress_dropped_threshold_critical | D2C Telemetry Dropped Failed limit (critical threshold) | string | `1000` | no | +| d2c_telemetry_egress_dropped_threshold_warning | D2C Telemetry Dropped Failed limit (warning threshold) | string | `500` | no | +| d2c_telemetry_egress_fallback_threshold_critical | D2C Telemetry Fallback Failed limit (critical threshold) | string | `1000` | no | +| d2c_telemetry_egress_fallback_threshold_warning | D2C Telemetry Fallback Failed limit (warning threshold) | string | `500` | no | +| 
d2c_telemetry_egress_invalid_threshold_critical | D2C Telemetry Invalid Failed limit (critical threshold) | string | `1000` | no | +| d2c_telemetry_egress_invalid_threshold_warning | D2C Telemetry Invalid Failed limit (warning threshold) | string | `500` | no | +| d2c_telemetry_egress_orphaned_threshold_critical | D2C Telemetry Orphaned Failed limit (critical threshold) | string | `1000` | no | +| d2c_telemetry_egress_orphaned_threshold_warning | D2C Telemetry Orphaned Failed limit (warning threshold) | string | `500` | no | +| d2c_twin_read_failed_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| d2c_twin_read_failed_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| d2c_twin_update_failed_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| d2c_twin_update_failed_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| environment | Architecture Environment | string | - | yes | +| jobs_failed_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | +| jobs_failed_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | +| listjobs_failed_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | +| listjobs_failed_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | +| message | Message sent when an alert is triggered | string | - | yes | +| queryjobs_failed_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | +| queryjobs_failed_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| subscription_id | Subscription ID used to tag monitors | string | - | yes | Related documentation --------------------- 
-DataDog documentation: https://docs.datadoghq.com/integrations/azure_iot_hub/ +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_iot_hub](https://docs.datadoghq.com/integrations/azure_iot_hub) -Azure IOT Hubs metrics documentation: https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health \ No newline at end of file +Azure IOT Hubs metrics documentation: [https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health](https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health) \ No newline at end of file diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 5ae0587..cc591cd 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -1,138 +1,144 @@ -variable "environment" {} +variable "environment" { + description = "Architecture Environment" + type = "string" +} -variable "stack" {} +variable "client_name" { + description = "Client Name" + type = "string" +} -variable "client_name" {} - -variable "subscription_id" {} +variable "subscription_id" { + description = "Subscription ID used to tag monitors" + type = "string" +} variable "delay" { + description = "Delay in seconds for the metric evaluation" default = 600 } +variable "message" { + description = "Message sent when an alert is triggered" +} + ## IOT hubs variable "jobs_failed_threshold_warning" { + description = "Jobs Failed rate limit (warning threshold)" default = 0 } variable "jobs_failed_threshold_critical" { + description = "Jobs Failed rate limit (critical threshold)" default = 10 } -variable "jobs_failed_message" {} - variable "listjobs_failed_threshold_warning" { + description = "ListJobs Failed rate limit (warning threshold)" default = 0 } variable "listjobs_failed_threshold_critical" { + description = "ListJobs Failed rate limit (critical threshold)" default = 10 } -variable "listjobs_failed_message" {} - variable "queryjobs_failed_threshold_warning" { + description = 
"QueryJobs Failed rate limit (warning threshold)" default = 0 } variable "queryjobs_failed_threshold_critical" { + description = "QueryJobs Failed rate limit (critical threshold)" default = 10 } -variable "queryjobs_failed_message" {} - -variable "status_message" {} - -variable "total_devices_message" {} - variable "c2d_methods_failed_threshold_warning" { + description = "C2D Methods Failed rate limit (warning threshold)" default = 0 } variable "c2d_methods_failed_threshold_critical" { + description = "C2D Methods Failed rate limit (critical threshold)" default = 10 } -variable "c2d_methods_failed_message" {} - variable "c2d_twin_read_failed_threshold_warning" { + description = "C2D Twin Read Failed rate limit (warning threshold)" default = 0 } variable "c2d_twin_read_failed_threshold_critical" { + description = "C2D Twin Read Failed rate limit (critical threshold)" default = 10 } -variable "c2d_twin_read_failed_message" {} - variable "c2d_twin_update_failed_threshold_warning" { + description = "C2D Twin Update Failed rate limit (warning threshold)" default = 0 } variable "c2d_twin_update_failed_threshold_critical" { + description = "C2D Twin Update Failed rate limit (critical threshold)" default = 10 } -variable "c2d_twin_update_failed_message" {} - variable "d2c_twin_read_failed_threshold_warning" { + description = "D2C Twin Read Failed rate limit (warning threshold)" default = 0 } variable "d2c_twin_read_failed_threshold_critical" { + description = "D2C Twin Read Failed rate limit (critical threshold)" default = 10 } -variable "d2c_twin_read_failed_message" {} - variable "d2c_twin_update_failed_threshold_warning" { + description = "D2C Twin Update Failed rate limit (warning threshold)" default = 0 } variable "d2c_twin_update_failed_threshold_critical" { + description = "D2C Twin Update Failed rate limit (critical threshold)" default = 10 } -variable "d2c_twin_update_failed_message" {} - variable "d2c_telemetry_egress_dropped_threshold_warning" { + description = 
"D2C Telemetry Dropped Failed limit (warning threshold)" default = 500 } variable "d2c_telemetry_egress_dropped_threshold_critical" { + description = "D2C Telemetry Dropped Failed limit (critical threshold)" default = 1000 } -variable "d2c_telemetry_egress_dropped_message" {} - variable "d2c_telemetry_egress_orphaned_threshold_warning" { + description = "D2C Telemetry Orphaned Failed limit (warning threshold)" default = 500 } variable "d2c_telemetry_egress_orphaned_threshold_critical" { + description = "D2C Telemetry Orphaned Failed limit (critical threshold)" default = 1000 } -variable "d2c_telemetry_egress_orphaned_message" {} - variable "d2c_telemetry_egress_invalid_threshold_warning" { + description = "D2C Telemetry Invalid Failed limit (warning threshold)" default = 500 } variable "d2c_telemetry_egress_invalid_threshold_critical" { + description = "D2C Telemetry Invalid Failed limit (critical threshold)" default = 1000 } -variable "d2c_telemetry_egress_invalid_message" {} - variable "d2c_telemetry_egress_fallback_threshold_warning" { + description = "D2C Telemetry Fallback Failed limit (warning threshold)" default = 500 } variable "d2c_telemetry_egress_fallback_threshold_critical" { + description = "D2C Telemetry Fallback Failed limit (critical threshold)" default = 1000 } - -variable "d2c_telemetry_egress_fallback_message" {} - -variable "d2c_telemetry_ingress_nosent_message" {} diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 4c59099..f4a7073 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,8 +1,14 @@ resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" - message = "${var.jobs_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( 
avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" + query = < ${var.jobs_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -24,9 +30,15 @@ resource "datadog_monitor" "too_many_jobs_failed" { resource "datadog_monitor" "too_many_list_jobs_failed" { name = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}" - message = "${var.listjobs_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" + query = < ${var.listjobs_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -48,9 +60,15 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { resource "datadog_monitor" "too_many_query_jobs_failed" { name = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}" - message = "${var.queryjobs_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" + query = < 
${var.queryjobs_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -72,9 +90,11 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { resource "datadog_monitor" "status" { name = "[${var.environment}] IOT Hub Status is not ok on {{name}}" - message = "${var.status_message}" + message = "${var.message}" - query = "avg(last_5m):avg:azure.devices_iothubs.status{subscription_id:${var.subscription_id}} by {name,resource_group} < 1" + query = < ${var.c2d_methods_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -134,9 +162,15 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { resource "datadog_monitor" "too_many_c2d_twin_read_failed" { name = "[${var.environment}] IOT Hub Too many c2d twin read failure on {{name}}" - message = "${var.c2d_twin_read_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" + query = < ${var.c2d_twin_read_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -158,9 +192,15 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { resource "datadog_monitor" "too_many_c2d_twin_update_failed" { name = "[${var.environment}] IOT Hub Too many c2d twin update failure on {{name}}" - message = "${var.c2d_twin_update_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by 
{name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" + query = < ${var.c2d_twin_update_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -182,9 +222,15 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { resource "datadog_monitor" "too_many_d2c_twin_read_failed" { name = "[${var.environment}] IOT Hub Too many d2c twin read failure on {{name}}" - message = "${var.d2c_twin_read_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" + query = < ${var.d2c_twin_read_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -206,9 +252,15 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { resource "datadog_monitor" "too_many_d2c_twin_update_failed" { name = "[${var.environment}] IOT Hub Too many d2c twin update failure on {{name}}" - message = "${var.d2c_twin_update_failed_message}" + message = "${var.message}" - query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" + query = < 
${var.d2c_twin_update_failed_threshold_critical} + EOF type = "query alert" thresholds { @@ -230,9 +282,13 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { name = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped on {{name}}" - message = "${var.d2c_telemetry_egress_dropped_message}" + message = "${var.message}" - query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_dropped_threshold_critical}" + query = < ${var.d2c_telemetry_egress_dropped_threshold_critical} + EOF type = "query alert" thresholds { @@ -254,9 +310,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { name = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned on {{name}}" - message = "${var.d2c_telemetry_egress_orphaned_message}" + message = "${var.message}" - query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_orphaned_threshold_critical}" + query = < ${var.d2c_telemetry_egress_orphaned_threshold_critical} + EOF type = "query alert" thresholds { @@ -278,9 +338,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid on {{name}}" - message = "${var.d2c_telemetry_egress_invalid_message}" + message = "${var.message}" - query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_invalid_threshold_critical}" + query = < ${var.d2c_telemetry_egress_invalid_threshold_critical} + EOF type = "query 
alert" thresholds { @@ -302,9 +366,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { name = "[${var.environment}] IOT Hub Too many d2c telemetry egress fallback on {{name}}" - message = "${var.d2c_telemetry_egress_fallback_message}" + message = "${var.message}" - query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.fallback{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_fallback_threshold_critical}" + query = < ${var.d2c_telemetry_egress_fallback_threshold_critical} + EOF type = "query alert" thresholds { @@ -326,9 +394,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress no sent on {{name}}" - message = "${var.d2c_telemetry_ingress_nosent_message}" + message = "${var.message}" - query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() - avg:azure.devices_iothubs.d2c.telemetry.ingress.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() > 0" + query = < 0 + EOF type = "query alert" notify_no_data = false From 9186c6915042ad8f969e05e94ce5db7a5a6fc188 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Tue, 31 Oct 2017 15:37:13 +0100 Subject: [PATCH 10/14] MON-80 Now support use_filter_tags --- cloud/azure/iothubs/inputs.tf | 5 +++++ cloud/azure/iothubs/monitors-iothubs.tf | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index cc591cd..d04d03b 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -8,6 +8,11 @@ variable "client_name" { type = "string" } +variable "use_filter_tags" { + description = "Filter the data with 
service tags if true" + default = "true" +} + variable "subscription_id" { description = "Subscription ID used to tag monitors" type = "string" diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index f4a7073..1ee29a3 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -1,3 +1,11 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_appservices:enabled,subscription_id:%s,env:%s", var.subscription_id,var.environment) : var.subscription_id}" + } +} + resource "datadog_monitor" "too_many_jobs_failed" { name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" message = "${var.message}" From 4f2d9bd6943231e0f980e02dad53ae85098c795b Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 3 Nov 2017 20:35:35 +0100 Subject: [PATCH 11/14] MON-80 add tags --- cloud/azure/iothubs/README.md | 6 +- cloud/azure/iothubs/inputs.tf | 34 +++++---- cloud/azure/iothubs/monitors-iothubs.tf | 96 ++++++++++++++++--------- 3 files changed, 85 insertions(+), 51 deletions(-) diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index 3d6bb91..a0e4be5 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -9,10 +9,7 @@ module "iothubs" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors" message = "${module.datadog-message-alerting.alerting-message}" - environment = "${var.environment}" - stack = "${var.stack}" - client_name = "${var.client_name}" subscription_id = "${var.subscription_id}" } ``` @@ -48,7 +45,6 @@ Inputs | c2d_twin_read_failed_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | | c2d_twin_update_failed_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | 
| c2d_twin_update_failed_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | -| client_name | Client Name | string | - | yes | | d2c_telemetry_egress_dropped_threshold_critical | D2C Telemetry Dropped Failed limit (critical threshold) | string | `1000` | no | | d2c_telemetry_egress_dropped_threshold_warning | D2C Telemetry Dropped Failed limit (warning threshold) | string | `500` | no | | d2c_telemetry_egress_fallback_threshold_critical | D2C Telemetry Fallback Failed limit (critical threshold) | string | `1000` | no | @@ -77,4 +73,4 @@ Related documentation DataDog documentation: [https://docs.datadoghq.com/integrations/azure_iot_hub](https://docs.datadoghq.com/integrations/azure_iot_hub) -Azure IOT Hubs metrics documentation: [https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health](https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health) \ No newline at end of file +Azure IOT Hubs metrics documentation: [https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health](https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index d04d03b..1efabc3 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -1,23 +1,26 @@ +# Global Terraform variable "environment" { description = "Architecture Environment" type = "string" } -variable "client_name" { - description = "Client Name" - type = "string" -} - -variable "use_filter_tags" { - description = "Filter the data with service tags if true" - default = "true" -} - variable "subscription_id" { - description = "Subscription ID used to tag monitors" - type = "string" + description = "Azure account id used as filter for monitors" + type = "string" } +variable "provider" { + description = "Cloud provider which the monitor and its based metric depend on" + type = "string" + default = "azure" +} + 
+variable "service" { + description = "Service monitored by this set of monitors" + type = "string" + default = "storage" + +# Global DataDog variable "delay" { description = "Delay in seconds for the metric evaluation" default = 600 @@ -27,7 +30,12 @@ variable "message" { description = "Message sent when an alert is triggered" } -## IOT hubs +variable "use_filter_tags" { + description = "Filter the data with service tags if true" + default = "true" +} + +# Azure IOT hubs specific variable "jobs_failed_threshold_warning" { description = "Jobs Failed rate limit (warning threshold)" default = 0 diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 1ee29a3..4398f5f 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_appservices:enabled,subscription_id:%s,env:%s", var.subscription_id,var.environment) : var.subscription_id}" + filter = "${var.use_filter_tags == "true" ? 
format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "${data.template_file.filter.rendered}"}" } } @@ -12,9 +12,9 @@ resource "datadog_monitor" "too_many_jobs_failed" { query = < ${var.jobs_failed_threshold_critical} EOF type = "query alert" @@ -34,6 +34,8 @@ resource "datadog_monitor" "too_many_jobs_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_list_jobs_failed" { @@ -42,9 +44,9 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { query = < ${var.listjobs_failed_threshold_critical} EOF type = "query alert" @@ -64,6 +66,8 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_query_jobs_failed" { @@ -72,9 +76,9 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { query = < ${var.queryjobs_failed_threshold_critical} EOF type = "query alert" @@ -94,6 +98,8 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "status" { @@ -101,7 +107,7 @@ resource "datadog_monitor" "status" { message = "${var.message}" query = < ${var.c2d_methods_failed_threshold_critical} EOF type = "query alert" @@ -166,6 +176,8 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_c2d_twin_read_failed" { @@ -174,9 +186,9 @@ resource "datadog_monitor" 
"too_many_c2d_twin_read_failed" { query = < ${var.c2d_twin_read_failed_threshold_critical} EOF type = "query alert" @@ -196,6 +208,8 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_c2d_twin_update_failed" { @@ -204,9 +218,9 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { query = < ${var.c2d_twin_update_failed_threshold_critical} EOF type = "query alert" @@ -226,6 +240,8 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_twin_read_failed" { @@ -234,9 +250,9 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { query = < ${var.d2c_twin_read_failed_threshold_critical} EOF type = "query alert" @@ -256,6 +272,8 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_twin_update_failed" { @@ -264,9 +282,9 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { query = < ${var.d2c_twin_update_failed_threshold_critical} EOF type = "query alert" @@ -286,6 +304,8 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { @@ -294,7 +314,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { query = < 
${var.d2c_telemetry_egress_dropped_threshold_critical} EOF type = "query alert" @@ -314,6 +334,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { @@ -322,7 +344,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { query = < ${var.d2c_telemetry_egress_orphaned_threshold_critical} EOF type = "query alert" @@ -342,6 +364,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { @@ -350,7 +374,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { query = < ${var.d2c_telemetry_egress_invalid_threshold_critical} EOF type = "query alert" @@ -370,6 +394,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { @@ -378,7 +404,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { query = < ${var.d2c_telemetry_egress_fallback_threshold_critical} EOF type = "query alert" @@ -398,6 +424,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { @@ -406,8 +434,8 @@ resource "datadog_monitor" 
"too_many_d2c_telemetry_ingress_nosent" { query = < 0 EOF type = "query alert" @@ -422,4 +450,6 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { require_full_window = true new_host_delay = "${var.delay}" no_data_timeframe = 20 + + tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] } From 2593e5fac4e9da58531e7195af53e5c62802c424 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 3 Nov 2017 20:47:29 +0100 Subject: [PATCH 12/14] MON-80 update readme --- cloud/azure/iothubs/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index a0e4be5..362e226 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -64,8 +64,10 @@ Inputs | listjobs_failed_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | | listjobs_failed_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | | message | Message sent when an alert is triggered | string | - | yes | +| provider | What is the monitored provider | string | azure | no | | queryjobs_failed_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | | queryjobs_failed_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| service | What is the monitored service | string | storage | no | | subscription_id | Subscription ID used to tag monitors | string | - | yes | Related documentation From e0fa47008ae60aa0ba97eb6b2c33d40b0c2e596a Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Mon, 6 Nov 2017 10:30:00 +0100 Subject: [PATCH 13/14] MON-80 Update variables' names --- cloud/azure/iothubs/README.md | 57 ++++----- cloud/azure/iothubs/inputs.tf | 125 ++++++++++---------- cloud/azure/iothubs/monitors-iothubs.tf | 147 +++++++++++++----------- 3 files changed, 173 insertions(+), 156 deletions(-) diff --git a/cloud/azure/iothubs/README.md 
b/cloud/azure/iothubs/README.md index 362e226..339b357 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -1,5 +1,5 @@ Azure IOT Hubs DataDog monitors -============================ +=============================== How to use this module ---------------------- @@ -39,36 +39,37 @@ Inputs | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| -| c2d_methods_failed_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | -| c2d_methods_failed_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | -| c2d_twin_read_failed_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | -| c2d_twin_read_failed_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | -| c2d_twin_update_failed_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | -| c2d_twin_update_failed_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | -| d2c_telemetry_egress_dropped_threshold_critical | D2C Telemetry Dropped Failed limit (critical threshold) | string | `1000` | no | -| d2c_telemetry_egress_dropped_threshold_warning | D2C Telemetry Dropped Failed limit (warning threshold) | string | `500` | no | -| d2c_telemetry_egress_fallback_threshold_critical | D2C Telemetry Fallback Failed limit (critical threshold) | string | `1000` | no | -| d2c_telemetry_egress_fallback_threshold_warning | D2C Telemetry Fallback Failed limit (warning threshold) | string | `500` | no | -| d2c_telemetry_egress_invalid_threshold_critical | D2C Telemetry Invalid Failed limit (critical threshold) | string | `1000` | no | -| d2c_telemetry_egress_invalid_threshold_warning | D2C Telemetry Invalid Failed limit (warning threshold) | string | `500` | no | -| d2c_telemetry_egress_orphaned_threshold_critical | D2C Telemetry 
Orphaned Failed limit (critical threshold) | string | `1000` | no | -| d2c_telemetry_egress_orphaned_threshold_warning | D2C Telemetry Orphaned Failed limit (warning threshold) | string | `500` | no | -| d2c_twin_read_failed_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | -| d2c_twin_read_failed_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | -| d2c_twin_update_failed_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | -| d2c_twin_update_failed_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | | delay | Delay in seconds for the metric evaluation | string | `600` | no | +| dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no | +| dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no | | environment | Architecture Environment | string | - | yes | -| jobs_failed_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | -| jobs_failed_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | -| listjobs_failed_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | -| listjobs_failed_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | +| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | +| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | +| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| 
failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | +| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | +| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | +| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | +| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | +| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | +| fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | +| invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | +| invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | | message | Message sent when an alert is triggered | 
string | - | yes | -| provider | What is the monitored provider | string | azure | no | -| queryjobs_failed_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | -| queryjobs_failed_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | -| service | What is the monitored service | string | storage | no | -| subscription_id | Subscription ID used to tag monitors | string | - | yes | +| orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no | +| orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no | +| provider | Cloud provider which the monitor and its based metric depend on | string | `azure` | no | +| service | Service monitored by this set of monitors | string | `storage` | no | +| subscription_id | Azure account id used as filter for monitors | string | - | yes | +| use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation --------------------- diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 1efabc3..01c77fb 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -6,24 +6,25 @@ variable "environment" { variable "subscription_id" { description = "Azure account id used as filter for monitors" - type = "string" + type = "string" } variable "provider" { description = "Cloud provider which the monitor and its based metric depend on" - type = "string" - default = "azure" + type = "string" + default = "azure" } variable "service" { description = "Service monitored by this set of monitors" - type = "string" - default = "storage" + type = "string" + default = "storage" +} # Global DataDog variable "delay" { description = "Delay in seconds for the metric evaluation" - default = 600 + default = 600 } variable "message" { @@ -36,122 +37,122 @@ variable 
"use_filter_tags" { } # Azure IOT hubs specific -variable "jobs_failed_threshold_warning" { +variable "failed_jobs_rate_threshold_warning" { description = "Jobs Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "jobs_failed_threshold_critical" { +variable "failed_jobs_rate_threshold_critical" { description = "Jobs Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "listjobs_failed_threshold_warning" { +variable "failed_listjobs_rate_threshold_warning" { description = "ListJobs Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "listjobs_failed_threshold_critical" { +variable "failed_listjobs_rate_threshold_critical" { description = "ListJobs Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "queryjobs_failed_threshold_warning" { +variable "failed_queryjobs_rate_threshold_warning" { description = "QueryJobs Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "queryjobs_failed_threshold_critical" { +variable "failed_queryjobs_rate_threshold_critical" { description = "QueryJobs Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "c2d_methods_failed_threshold_warning" { +variable "failed_c2d_methods_rate_threshold_warning" { description = "C2D Methods Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "c2d_methods_failed_threshold_critical" { +variable "failed_c2d_methods_rate_threshold_critical" { description = "C2D Methods Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "c2d_twin_read_failed_threshold_warning" { +variable "failed_c2d_twin_read_rate_threshold_warning" { description = "C2D Twin Read Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "c2d_twin_read_failed_threshold_critical" { +variable "failed_c2d_twin_read_rate_threshold_critical" { description = "C2D Twin Read Failed rate limit (critical 
threshold)" - default = 10 + default = 10 } -variable "c2d_twin_update_failed_threshold_warning" { +variable "failed_c2d_twin_update_rate_threshold_warning" { description = "C2D Twin Update Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "c2d_twin_update_failed_threshold_critical" { +variable "failed_c2d_twin_update_rate_threshold_critical" { description = "C2D Twin Update Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "d2c_twin_read_failed_threshold_warning" { +variable "failed_d2c_twin_read_rate_threshold_warning" { description = "D2C Twin Read Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "d2c_twin_read_failed_threshold_critical" { +variable "failed_d2c_twin_read_rate_threshold_critical" { description = "D2C Twin Read Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "d2c_twin_update_failed_threshold_warning" { +variable "failed_d2c_twin_update_rate_threshold_warning" { description = "D2C Twin Update Failed rate limit (warning threshold)" - default = 0 + default = 0 } -variable "d2c_twin_update_failed_threshold_critical" { +variable "failed_d2c_twin_update_rate_threshold_critical" { description = "D2C Twin Update Failed rate limit (critical threshold)" - default = 10 + default = 10 } -variable "d2c_telemetry_egress_dropped_threshold_warning" { - description = "D2C Telemetry Dropped Failed limit (warning threshold)" - default = 500 +variable "dropped_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Dropped limit (warning threshold)" + default = 500 } -variable "d2c_telemetry_egress_dropped_threshold_critical" { - description = "D2C Telemetry Dropped Failed limit (critical threshold)" - default = 1000 +variable "dropped_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Dropped limit (critical threshold)" + default = 1000 } -variable "d2c_telemetry_egress_orphaned_threshold_warning" { - 
description = "D2C Telemetry Orphaned Failed limit (warning threshold)" - default = 500 +variable "orphaned_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Orphaned limit (warning threshold)" + default = 500 } -variable "d2c_telemetry_egress_orphaned_threshold_critical" { - description = "D2C Telemetry Orphaned Failed limit (critical threshold)" - default = 1000 +variable "orphaned_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Orphaned limit (critical threshold)" + default = 1000 } -variable "d2c_telemetry_egress_invalid_threshold_warning" { - description = "D2C Telemetry Invalid Failed limit (warning threshold)" - default = 500 +variable "invalid_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Invalid limit (warning threshold)" + default = 500 } -variable "d2c_telemetry_egress_invalid_threshold_critical" { - description = "D2C Telemetry Invalid Failed limit (critical threshold)" - default = 1000 +variable "invalid_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Invalid limit (critical threshold)" + default = 1000 } -variable "d2c_telemetry_egress_fallback_threshold_warning" { - description = "D2C Telemetry Fallback Failed limit (warning threshold)" - default = 500 +variable "fallback_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Fallback limit (warning threshold)" + default = 500 } -variable "d2c_telemetry_egress_fallback_threshold_critical" { - description = "D2C Telemetry Fallback Failed limit (critical threshold)" - default = 1000 +variable "fallback_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Fallback limit (critical threshold)" + default = 1000 } diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 4398f5f..d7fb7e3 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -15,13 +15,14 @@ resource 
"datadog_monitor" "too_many_jobs_failed" { avg:azure.devices_iothubs.jobs.failed{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) - ) * 100 > ${var.jobs_failed_threshold_critical} + ) * 100 > ${var.failed_jobs_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.jobs_failed_threshold_warning}" - critical = "${var.jobs_failed_threshold_critical}" + warning = "${var.failed_jobs_rate_threshold_warning}" + critical = "${var.failed_jobs_rate_threshold_critical}" } notify_no_data = false @@ -35,7 +36,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_list_jobs_failed" { @@ -47,13 +48,14 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { avg:azure.devices_iothubs.jobs.list_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() ) - ) * 100 > ${var.listjobs_failed_threshold_critical} + ) * 100 > ${var.failed_listjobs_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.listjobs_failed_threshold_warning}" - critical = "${var.listjobs_failed_threshold_critical}" + warning = "${var.failed_listjobs_rate_threshold_warning}" + critical = 
"${var.failed_listjobs_rate_threshold_critical}" } notify_no_data = false @@ -67,7 +69,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_query_jobs_failed" { @@ -79,13 +81,14 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { avg:azure.devices_iothubs.jobs.query_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{${data.template_file.filter.rendered}} by {resource_group,name}.as_count() ) - ) * 100 > ${var.queryjobs_failed_threshold_critical} + ) * 100 > ${var.failed_queryjobs_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.queryjobs_failed_threshold_warning}" - critical = "${var.queryjobs_failed_threshold_critical}" + warning = "${var.failed_queryjobs_rate_threshold_warning}" + critical = "${var.failed_queryjobs_rate_threshold_critical}" } notify_no_data = false @@ -99,7 +102,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "status" { @@ -109,7 +112,8 @@ resource "datadog_monitor" "status" { query = < ${var.c2d_methods_failed_threshold_critical} + ) * 100 > ${var.failed_c2d_methods_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.c2d_methods_failed_threshold_warning}" - critical = 
"${var.c2d_methods_failed_threshold_critical}" + warning = "${var.failed_c2d_methods_rate_threshold_warning}" + critical = "${var.failed_c2d_methods_rate_threshold_critical}" } notify_no_data = false @@ -177,7 +183,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_c2d_twin_read_failed" { @@ -189,13 +195,14 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { avg:azure.devices_iothubs.c2d.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) - ) * 100 > ${var.c2d_twin_read_failed_threshold_critical} + ) * 100 > ${var.failed_c2d_twin_read_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.c2d_twin_read_failed_threshold_warning}" - critical = "${var.c2d_twin_read_failed_threshold_critical}" + warning = "${var.failed_c2d_twin_read_rate_threshold_warning}" + critical = "${var.failed_c2d_twin_read_rate_threshold_critical}" } notify_no_data = false @@ -209,7 +216,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_c2d_twin_update_failed" { @@ -221,13 +228,14 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { 
avg:azure.devices_iothubs.c2d.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) - ) * 100 > ${var.c2d_twin_update_failed_threshold_critical} + ) * 100 > ${var.failed_c2d_twin_update_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.c2d_twin_update_failed_threshold_warning}" - critical = "${var.c2d_twin_update_failed_threshold_critical}" + warning = "${var.failed_c2d_twin_update_rate_threshold_warning}" + critical = "${var.failed_c2d_twin_update_rate_threshold_critical}" } notify_no_data = false @@ -241,7 +249,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_twin_read_failed" { @@ -253,13 +261,14 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { avg:azure.devices_iothubs.d2c.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) - ) * 100 > ${var.d2c_twin_read_failed_threshold_critical} + ) * 100 > ${var.failed_d2c_twin_read_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_twin_read_failed_threshold_warning}" - critical = 
"${var.d2c_twin_read_failed_threshold_critical}" + warning = "${var.failed_d2c_twin_read_rate_threshold_warning}" + critical = "${var.failed_d2c_twin_read_rate_threshold_critical}" } notify_no_data = false @@ -273,7 +282,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_twin_update_failed" { @@ -285,13 +294,14 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { avg:azure.devices_iothubs.d2c.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) - ) * 100 > ${var.d2c_twin_update_failed_threshold_critical} + ) * 100 > ${var.failed_d2c_twin_update_rate_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_twin_update_failed_threshold_warning}" - critical = "${var.d2c_twin_update_failed_threshold_critical}" + warning = "${var.failed_d2c_twin_update_rate_threshold_warning}" + critical = "${var.failed_d2c_twin_update_rate_threshold_critical}" } notify_no_data = false @@ -305,7 +315,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { @@ -315,13 +325,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { 
query = < ${var.d2c_telemetry_egress_dropped_threshold_critical} + ) > ${var.dropped_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_telemetry_egress_dropped_threshold_warning}" - critical = "${var.d2c_telemetry_egress_dropped_threshold_critical}" + warning = "${var.dropped_d2c_telemetry_egress_threshold_warning}" + critical = "${var.dropped_d2c_telemetry_egress_threshold_critical}" } notify_no_data = false @@ -335,7 +346,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { @@ -345,13 +356,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { query = < ${var.d2c_telemetry_egress_orphaned_threshold_critical} + ) > ${var.orphaned_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_telemetry_egress_orphaned_threshold_warning}" - critical = "${var.d2c_telemetry_egress_orphaned_threshold_critical}" + warning = "${var.orphaned_d2c_telemetry_egress_threshold_warning}" + critical = "${var.orphaned_d2c_telemetry_egress_threshold_critical}" } notify_no_data = false @@ -365,7 +377,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { @@ -375,13 +387,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { query = < 
${var.d2c_telemetry_egress_invalid_threshold_critical} + ) > ${var.invalid_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_telemetry_egress_invalid_threshold_warning}" - critical = "${var.d2c_telemetry_egress_invalid_threshold_critical}" + warning = "${var.invalid_d2c_telemetry_egress_threshold_warning}" + critical = "${var.invalid_d2c_telemetry_egress_threshold_critical}" } notify_no_data = false @@ -395,7 +408,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { @@ -405,13 +418,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { query = < ${var.d2c_telemetry_egress_fallback_threshold_critical} + ) > ${var.fallback_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + + type = "query alert" thresholds { - warning = "${var.d2c_telemetry_egress_fallback_threshold_warning}" - critical = "${var.d2c_telemetry_egress_fallback_threshold_critical}" + warning = "${var.fallback_d2c_telemetry_egress_threshold_warning}" + critical = "${var.fallback_d2c_telemetry_egress_threshold_critical}" } notify_no_data = false @@ -425,7 +439,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { @@ -438,7 +452,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { 
avg:azure.devices_iothubs.d2c.telemetry.ingress.success{${data.template_file.filter.rendered}} by {name,resource_group}.as_count() ) > 0 EOF - type = "query alert" + + type = "query alert" notify_no_data = false evaluation_delay = "${var.delay}" @@ -451,5 +466,5 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}","resource:${var.service}","team:${var.provider}"] + tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] } From 279778ed888f891f8a30d033b004275757c904ff Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Thu, 23 Nov 2017 15:12:54 +0100 Subject: [PATCH 14/14] MON-80 Normalize monitors --- cloud/azure/iothubs/README.md | 11 ++--- cloud/azure/iothubs/inputs.tf | 26 +++-------- cloud/azure/iothubs/monitors-iothubs.tf | 62 ++++++++++++------------- 3 files changed, 42 insertions(+), 57 deletions(-) diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index 339b357..5187715 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -8,9 +8,8 @@ How to use this module module "iothubs" { source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors" - message = "${module.datadog-message-alerting.alerting-message}" - environment = "${var.environment}" - subscription_id = "${var.subscription_id}" + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" } ``` @@ -61,15 +60,13 @@ Inputs | failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | | fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | | fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | +| filter_tags_custom | 
Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | | invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | | message | Message sent when an alert is triggered | string | - | yes | | orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no | | orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no | -| provider | Cloud provider which the monitor and its based metric depend on | string | `azure` | no | -| service | Service monitored by this set of monitors | string | `storage` | no | -| subscription_id | Azure account id used as filter for monitors | string | - | yes | -| use_filter_tags | Filter the data with service tags if true | string | `true` | no | Related documentation --------------------- diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index 01c77fb..1b1348f 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -4,23 +4,6 @@ variable "environment" { type = "string" } -variable "subscription_id" { - description = "Azure account id used as filter for monitors" - type = "string" -} - -variable "provider" { - description = "Cloud provider which the monitor and its based metric depend on" - type = "string" - default = "azure" -} - -variable "service" { - description = "Service monitored by this set of monitors" - type = "string" - default = "storage" -} - # Global DataDog variable "delay" { description = "Delay in seconds for the metric evaluation" @@ -31,11 +14,16 @@ variable "message" { description = "Message sent when an alert is triggered" } -variable 
"use_filter_tags" { - description = "Filter the data with service tags if true" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" default = "true" } +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + # Azure IOT hubs specific variable "failed_jobs_rate_threshold_warning" { description = "Jobs Failed rate limit (warning threshold)" diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index d7fb7e3..6e1f926 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -2,7 +2,7 @@ data "template_file" "filter" { template = "$${filter}" vars { - filter = "${var.use_filter_tags == "true" ? format("dd_monitoring:enabled,dd_azure_storage:enabled,env:%s", var.environment) : "${data.template_file.filter.rendered}"}" + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_azure_iothub:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" } } @@ -18,7 +18,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { ) * 100 > ${var.failed_jobs_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_jobs_rate_threshold_warning}" @@ -36,7 +36,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_list_jobs_failed" { @@ -51,7 +51,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { ) * 100 > ${var.failed_listjobs_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_listjobs_rate_threshold_warning}" @@ -69,7 +69,7 @@ resource 
"datadog_monitor" "too_many_list_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_query_jobs_failed" { @@ -84,7 +84,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { ) * 100 > ${var.failed_queryjobs_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_queryjobs_rate_threshold_warning}" @@ -102,7 +102,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "status" { @@ -113,7 +113,7 @@ resource "datadog_monitor" "status" { avg(last_5m):avg:azure.devices_iothubs.status{${data.template_file.filter.rendered}} by {name,resource_group} < 1 EOF - type = "query alert" + type = "metric alert" notify_no_data = true evaluation_delay = "${var.delay}" @@ -126,7 +126,7 @@ resource "datadog_monitor" "status" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "total_devices" { @@ -137,7 +137,7 @@ resource "datadog_monitor" "total_devices" { avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{${data.template_file.filter.rendered}} by {name,resource_group} == 0 EOF - type = "query alert" + type = "metric alert" notify_no_data = true evaluation_delay = "${var.delay}" @@ -150,7 +150,7 @@ resource "datadog_monitor" "total_devices" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = 
["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_c2d_methods_failed" { @@ -165,7 +165,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { ) * 100 > ${var.failed_c2d_methods_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_c2d_methods_rate_threshold_warning}" @@ -183,7 +183,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_c2d_twin_read_failed" { @@ -198,7 +198,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { ) * 100 > ${var.failed_c2d_twin_read_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_c2d_twin_read_rate_threshold_warning}" @@ -216,7 +216,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_c2d_twin_update_failed" { @@ -231,7 +231,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { ) * 100 > ${var.failed_c2d_twin_update_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_c2d_twin_update_rate_threshold_warning}" @@ -249,7 +249,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", 
"team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_twin_read_failed" { @@ -264,7 +264,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { ) * 100 > ${var.failed_d2c_twin_read_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_d2c_twin_read_rate_threshold_warning}" @@ -282,7 +282,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_twin_update_failed" { @@ -297,7 +297,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { ) * 100 > ${var.failed_d2c_twin_update_rate_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.failed_d2c_twin_update_rate_threshold_warning}" @@ -315,7 +315,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { @@ -328,7 +328,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { ) > ${var.dropped_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.dropped_d2c_telemetry_egress_threshold_warning}" @@ -346,7 +346,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = 
["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { @@ -359,7 +359,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { ) > ${var.orphaned_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.orphaned_d2c_telemetry_egress_threshold_warning}" @@ -377,7 +377,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { @@ -390,7 +390,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { ) > ${var.invalid_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.invalid_d2c_telemetry_egress_threshold_warning}" @@ -408,7 +408,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { @@ -421,7 +421,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { ) > ${var.fallback_d2c_telemetry_egress_threshold_critical} EOF - type = "query alert" + type = "metric alert" thresholds { warning = "${var.fallback_d2c_telemetry_egress_threshold_warning}" @@ -439,7 +439,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", 
"team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] } resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { @@ -453,7 +453,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { ) > 0 EOF - type = "query alert" + type = "metric alert" notify_no_data = false evaluation_delay = "${var.delay}" @@ -466,5 +466,5 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { new_host_delay = "${var.delay}" no_data_timeframe = 20 - tags = ["env:${var.environment}", "resource:${var.service}", "team:${var.provider}"] + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] }