MON-335 Fix IoTHub monitor for alert recovery on no data

Laurent Piroelle 2018-11-27 10:23:56 +01:00
parent 3ce5b9c829
commit 93eccaf593

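The change is the same for every monitor in the file: each rate query gains an outer default(..., 0) so that, when the underlying Azure metrics stop reporting, the expression evaluates to 0 and a triggered monitor can recover instead of staying stuck on "no data"; notify_no_data flips to true so a genuinely missing metric still raises a notification. Below is a minimal sketch of the resulting shape of one monitor; the resource name, tag scope, timeframe, and threshold are illustrative placeholders, since the real resources interpolate the module's variables.

resource "datadog_monitor" "example_failed_jobs_rate" {
  name    = "Example IoTHub failed jobs rate"
  type    = "metric alert"
  message = "Too many failed IoTHub jobs"

  # The outer default(..., 0) is the fix: if the inner metrics report
  # nothing, the whole expression evaluates to 0 instead of "no data",
  # so an alerting monitor can recover.
  query = <<EOF
avg(last_15m):
default(
  default(avg:azure.devices_iothubs.jobs.failed{*} by {name}.as_rate(), 0) / (
  default(avg:azure.devices_iothubs.jobs.failed{*} by {name}.as_rate(), 0) +
  default(avg:azure.devices_iothubs.jobs.completed{*} by {name}.as_rate(), 0) )
* 100, 0) > 10
EOF

  # With the query defaulting to 0, "no data" now means the metrics are
  # truly absent, which is worth a notification of its own.
  notify_no_data = true
}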

@@ -4,11 +4,12 @@ resource "datadog_monitor" "too_many_jobs_failed" {
 message = "${coalesce(var.failed_jobs_rate_message, var.message)}"
 query = <<EOF
-${var.failed_jobs_rate_time_aggregator}(${var.failed_jobs_rate_timeframe}):(
+${var.failed_jobs_rate_time_aggregator}(${var.failed_jobs_rate_timeframe}):
+default(
 default(avg:azure.devices_iothubs.jobs.failed{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
 default(avg:azure.devices_iothubs.jobs.failed{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.jobs.completed{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
-) * 100 > ${var.failed_jobs_rate_threshold_critical}
+* 100, 0) > ${var.failed_jobs_rate_threshold_critical}
 EOF
 type = "metric alert"
@@ -20,7 +21,7 @@ resource "datadog_monitor" "too_many_jobs_failed" {
 silenced = "${var.failed_jobs_rate_silenced}"
-notify_no_data = false
+notify_no_data = true
 evaluation_delay = "${var.evaluation_delay}"
 renotify_interval = 0
 notify_audit = false
@@ -39,11 +40,12 @@ resource "datadog_monitor" "too_many_list_jobs_failed" {
 message = "${coalesce(var.failed_listjobs_rate_message, var.message)}"
 query = <<EOF
-${var.failed_listjobs_rate_time_aggregator}(${var.failed_listjobs_rate_timeframe}):(
+${var.failed_listjobs_rate_time_aggregator}(${var.failed_listjobs_rate_timeframe}):
+default(
 default(avg:azure.devices_iothubs.jobs.list_jobs.failure{${var.filter_tags}} by {resource_group,name}.as_rate(), 0) / (
 default(avg:azure.devices_iothubs.jobs.list_jobs.success{${var.filter_tags}} by {resource_group,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.jobs.list_jobs.failure{${var.filter_tags}} by {resource_group,name}.as_rate(), 0) )
-) * 100 > ${var.failed_listjobs_rate_threshold_critical}
+* 100, 0) > ${var.failed_listjobs_rate_threshold_critical}
 EOF
 type = "metric alert"
@@ -55,7 +57,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" {
 silenced = "${var.failed_listjobs_rate_silenced}"
-notify_no_data = false
+notify_no_data = true
 evaluation_delay = "${var.evaluation_delay}"
 renotify_interval = 0
 notify_audit = false
@@ -74,11 +76,12 @@ resource "datadog_monitor" "too_many_query_jobs_failed" {
 message = "${coalesce(var.failed_queryjobs_rate_message, var.message)}"
 query = <<EOF
-${var.failed_queryjobs_rate_time_aggregator}(${var.failed_queryjobs_rate_timeframe}):(
+${var.failed_queryjobs_rate_time_aggregator}(${var.failed_queryjobs_rate_timeframe}):
+default(
 default(avg:azure.devices_iothubs.jobs.query_jobs.failure{${var.filter_tags}} by {resource_group,name}.as_rate(), 0) / (
 default(avg:azure.devices_iothubs.jobs.query_jobs.success{${var.filter_tags}} by {resource_group,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.jobs.query_jobs.failure{${var.filter_tags}} by {resource_group,name}.as_rate(), 0) )
-) * 100 > ${var.failed_queryjobs_rate_threshold_critical}
+* 100, 0) > ${var.failed_queryjobs_rate_threshold_critical}
 EOF
 type = "metric alert"
@@ -90,7 +93,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" {
 silenced = "${var.failed_queryjobs_rate_silenced}"
-notify_no_data = false
+notify_no_data = true
 evaluation_delay = "${var.evaluation_delay}"
 renotify_interval = 0
 notify_audit = false
@@ -165,11 +168,12 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" {
 message = "${coalesce(var.failed_c2d_methods_rate_message, var.message)}"
 query = <<EOF
-${var.failed_c2d_methods_rate_time_aggregator}(${var.failed_c2d_methods_rate_timeframe}):(
+${var.failed_c2d_methods_rate_time_aggregator}(${var.failed_c2d_methods_rate_timeframe}):
+default(
 default(avg:azure.devices_iothubs.c2d.methods.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
 default(avg:azure.devices_iothubs.c2d.methods.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.c2d.methods.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
-) * 100 > ${var.failed_c2d_methods_rate_threshold_critical}
+* 100, 0) > ${var.failed_c2d_methods_rate_threshold_critical}
 EOF
 type = "metric alert"
@@ -181,7 +185,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" {
 silenced = "${var.failed_c2d_methods_rate_silenced}"
-notify_no_data = false
+notify_no_data = true
 evaluation_delay = "${var.evaluation_delay}"
 renotify_interval = 0
 notify_audit = false
@@ -200,11 +204,12 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
 message = "${coalesce(var.failed_c2d_twin_read_rate_message, var.message)}"
 query = <<EOF
-${var.failed_c2d_twin_read_rate_time_aggregator}(${var.failed_c2d_twin_read_rate_timeframe}):(
+${var.failed_c2d_twin_read_rate_time_aggregator}(${var.failed_c2d_twin_read_rate_timeframe}):
+default(
 default(avg:azure.devices_iothubs.c2d.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
 default(avg:azure.devices_iothubs.c2d.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.c2d.twin.read.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
-) * 100 > ${var.failed_c2d_twin_read_rate_threshold_critical}
+* 100, 0) > ${var.failed_c2d_twin_read_rate_threshold_critical}
 EOF
 type = "metric alert"
@@ -216,7 +221,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
 silenced = "${var.failed_c2d_twin_read_rate_silenced}"
-notify_no_data = false
+notify_no_data = true
 evaluation_delay = "${var.evaluation_delay}"
 renotify_interval = 0
 notify_audit = false
@@ -235,11 +240,12 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
 message = "${coalesce(var.failed_c2d_twin_update_rate_message, var.message)}"
 query = <<EOF
-${var.failed_c2d_twin_update_rate_time_aggregator}(${var.failed_c2d_twin_update_rate_timeframe}):(
+${var.failed_c2d_twin_update_rate_time_aggregator}(${var.failed_c2d_twin_update_rate_timeframe}):
+default(
 default(avg:azure.devices_iothubs.c2d.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
 default(avg:azure.devices_iothubs.c2d.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.c2d.twin.update.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
-) * 100 > ${var.failed_c2d_twin_update_rate_threshold_critical}
+* 100, 0) > ${var.failed_c2d_twin_update_rate_threshold_critical}
 EOF
 type = "metric alert"
@@ -251,7 +257,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
 silenced = "${var.failed_c2d_twin_update_rate_silenced}"
-notify_no_data = false
+notify_no_data = true
 evaluation_delay = "${var.evaluation_delay}"
 renotify_interval = 0
 notify_audit = false
@@ -270,11 +276,12 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" {
 message = "${coalesce(var.failed_d2c_twin_read_rate_message, var.message)}"
 query = <<EOF
-${var.failed_d2c_twin_read_rate_time_aggregator}(${var.failed_d2c_twin_read_rate_timeframe}):(
+${var.failed_d2c_twin_read_rate_time_aggregator}(${var.failed_d2c_twin_read_rate_timeframe}):
+default(
 default(avg:azure.devices_iothubs.d2c.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
 default(avg:azure.devices_iothubs.d2c.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.d2c.twin.read.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
-) * 100 > ${var.failed_d2c_twin_read_rate_threshold_critical}
+* 100, 0) > ${var.failed_d2c_twin_read_rate_threshold_critical}
 EOF
 type = "metric alert"
@@ -286,7 +293,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" {
 silenced = "${var.failed_d2c_twin_read_rate_silenced}"
-notify_no_data = false
+notify_no_data = true
 evaluation_delay = "${var.evaluation_delay}"
 renotify_interval = 0
 notify_audit = false
@@ -305,11 +312,12 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" {
 message = "${coalesce(var.failed_d2c_twin_update_rate_message, var.message)}"
 query = <<EOF
-${var.failed_d2c_twin_update_rate_time_aggregator}(${var.failed_d2c_twin_update_rate_timeframe}):(
+${var.failed_d2c_twin_update_rate_time_aggregator}(${var.failed_d2c_twin_update_rate_timeframe}):
+default(
 default(avg:azure.devices_iothubs.d2c.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
 default(avg:azure.devices_iothubs.d2c.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.d2c.twin.update.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
-) * 100 > ${var.failed_d2c_twin_update_rate_threshold_critical}
+* 100, 0) > ${var.failed_d2c_twin_update_rate_threshold_critical}
 EOF
 type = "metric alert"
@@ -321,7 +329,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" {
 silenced = "${var.failed_d2c_twin_update_rate_silenced}"
-notify_no_data = false
+notify_no_data = true
 evaluation_delay = "${var.evaluation_delay}"
 renotify_interval = 0
 notify_audit = false
@@ -340,13 +348,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
 message = "${coalesce(var.dropped_d2c_telemetry_egress_message, var.message)}"
 query = <<EOF
-${var.dropped_d2c_telemetry_egress_time_aggregator}(${var.dropped_d2c_telemetry_egress_timeframe}): (
+${var.dropped_d2c_telemetry_egress_time_aggregator}(${var.dropped_d2c_telemetry_egress_timeframe}):
+default(
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
-) * 100 > ${var.dropped_d2c_telemetry_egress_rate_threshold_critical}
+* 100, 0) > ${var.dropped_d2c_telemetry_egress_rate_threshold_critical}
 EOF
 type = "metric alert"
@@ -358,7 +367,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
 silenced = "${var.dropped_d2c_telemetry_egress_silenced}"
-notify_no_data = false
+notify_no_data = true
 evaluation_delay = "${var.evaluation_delay}"
 renotify_interval = 0
 notify_audit = false
@@ -377,13 +386,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
 message = "${coalesce(var.orphaned_d2c_telemetry_egress_message, var.message)}"
 query = <<EOF
-${var.orphaned_d2c_telemetry_egress_time_aggregator}(${var.orphaned_d2c_telemetry_egress_timeframe}): (
+${var.orphaned_d2c_telemetry_egress_time_aggregator}(${var.orphaned_d2c_telemetry_egress_timeframe}):
+default(
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
-) * 100 > ${var.orphaned_d2c_telemetry_egress_rate_threshold_critical}
+* 100, 0) > ${var.orphaned_d2c_telemetry_egress_rate_threshold_critical}
 EOF
 type = "metric alert"
@@ -395,7 +405,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
 silenced = "${var.orphaned_d2c_telemetry_egress_silenced}"
-notify_no_data = false
+notify_no_data = true
 evaluation_delay = "${var.evaluation_delay}"
 renotify_interval = 0
 notify_audit = false
@@ -414,13 +424,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
 message = "${coalesce(var.invalid_d2c_telemetry_egress_message, var.message)}"
 query = <<EOF
-${var.invalid_d2c_telemetry_egress_time_aggregator}(${var.invalid_d2c_telemetry_egress_timeframe}): (
+${var.invalid_d2c_telemetry_egress_time_aggregator}(${var.invalid_d2c_telemetry_egress_timeframe}):
+default(
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
-) * 100 > ${var.invalid_d2c_telemetry_egress_rate_threshold_critical}
+* 100, 0) > ${var.invalid_d2c_telemetry_egress_rate_threshold_critical}
 EOF
 type = "metric alert"
@@ -432,7 +443,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
 silenced = "${var.invalid_d2c_telemetry_egress_silenced}"
-notify_no_data = false
+notify_no_data = true
 evaluation_delay = "${var.evaluation_delay}"
 renotify_interval = 0
 notify_audit = false
@@ -451,17 +462,18 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" {
 message = "${coalesce(var.too_many_d2c_telemetry_ingress_nosent_message, var.message)}"
 query = <<EOF
-sum(${var.too_many_d2c_telemetry_ingress_nosent_timeframe}): (
+sum(${var.too_many_d2c_telemetry_ingress_nosent_timeframe}):
+default(
 avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol{${var.filter_tags}} by {resource_group,region,name}.as_count() -
 avg:azure.devices_iothubs.d2c.telemetry.ingress.success{${var.filter_tags}} by {resource_group,region,name}.as_count()
-) > 0
+, 0) > 0
 EOF
 type = "metric alert"
 silenced = "${var.too_many_d2c_telemetry_ingress_nosent_silenced}"
-notify_no_data = false
+notify_no_data = true
 evaluation_delay = "${var.evaluation_delay}"
 renotify_interval = 0
 notify_audit = false
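For reference, once Terraform interpolates the variables, the last query above renders to something like the following (the last_5m timeframe and the {*} scope are illustrative placeholders, not the module's defaults):

sum(last_5m):
default(
avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol{*} by {resource_group,region,name}.as_count() -
avg:azure.devices_iothubs.d2c.telemetry.ingress.success{*} by {resource_group,region,name}.as_count()
, 0) > 0

It triggers when some ingress telemetry was received but not successfully processed, and the default(..., 0) wrapper lets the monitor evaluate to 0 and recover when the hub reports no traffic at all.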