From 715fe91a7bc5faf1857d03c4db1bc99233648a05 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Wed, 10 Oct 2018 17:40:15 +0200 Subject: [PATCH] MON-332 - Filter-tags added on Azure IOTHub monitors --- cloud/azure/iothubs/README.md | 4 +- cloud/azure/iothubs/inputs.tf | 14 ++- cloud/azure/iothubs/modules.tf | 8 ++ cloud/azure/iothubs/monitors-iothubs.tf | 146 +++++++++++------------- 4 files changed, 90 insertions(+), 82 deletions(-) create mode 100644 cloud/azure/iothubs/modules.tf diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md index 1636fd5..c739f0c 100644 --- a/cloud/azure/iothubs/README.md +++ b/cloud/azure/iothubs/README.md @@ -109,7 +109,9 @@ Creates DataDog monitors with the following checks: | failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` | no | | failed_queryjobs_rate_time_aggregator | Monitor aggregator for IoT Hub failed query jobs [available values: min, max, sum or avg] | string | `min` | no | | failed_queryjobs_rate_timeframe | Monitor timeframe for IoT Hub failed query jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | -| filter_tags | Tags used for filtering | string | `*` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_custom_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | invalid_d2c_telemetry_egress_enabled | Flag to enable IoT Hub invalid d2c telemetry monitor | string | `true` | no | | invalid_d2c_telemetry_egress_extra_tags | Extra tags for IoT Hub invalid d2c telemetry monitor | list | `[]` | no | | invalid_d2c_telemetry_egress_message | Custom message for IoT Hub invalid d2c telemetry monitor | string | `` | no | diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf index e85cb9f..93d7c41 100644 --- a/cloud/azure/iothubs/inputs.tf +++ b/cloud/azure/iothubs/inputs.tf @@ -19,11 +19,21 @@ variable "message" { description = "Message sent when an alert is triggered" } -variable "filter_tags" { - description = "Tags used for filtering" +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" default = "*" } +variable "filter_tags_custom_excluded" { + description = "Tags excluded for custom filtering when filter_tags_use_defaults is false" + default = "" +} + # IOT Hub specific variables variable "status_silenced" { description = "Groups to mute for IoT Hub status monitor" diff --git a/cloud/azure/iothubs/modules.tf b/cloud/azure/iothubs/modules.tf new file mode 100644 index 0000000..cb481a0 --- /dev/null +++ b/cloud/azure/iothubs/modules.tf @@ -0,0 +1,8 @@ +module "filter-tags" { + source = "../../../common/filter-tags" + + environment = "${var.environment}" + resource = "azure_iothubs" + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" +} diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 1cc6f7f..1054119 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -4,12 +4,11 @@ resource "datadog_monitor" "too_many_jobs_failed" { message = "${coalesce(var.failed_jobs_rate_message, var.message)}" query = < ${var.failed_jobs_rate_threshold_critical} + ${var.failed_jobs_rate_time_aggregator}(${var.failed_jobs_rate_timeframe}):( + default(avg:azure.devices_iothubs.jobs.failed${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.jobs.failed${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.jobs.completed${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) ) + ) * 100 > ${var.failed_jobs_rate_threshold_critical} EOF type = "metric alert" @@ -40,12 +39,11 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { message = "${coalesce(var.failed_listjobs_rate_message, var.message)}" query = < ${var.failed_listjobs_rate_threshold_critical} + ${var.failed_listjobs_rate_time_aggregator}(${var.failed_listjobs_rate_timeframe}):( + default(avg:azure.devices_iothubs.jobs.list_jobs.failure${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.jobs.list_jobs.success${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.jobs.list_jobs.failure${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) ) + ) * 100 > ${var.failed_listjobs_rate_threshold_critical} EOF type = "metric alert" @@ -76,12 +74,11 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { message = "${coalesce(var.failed_queryjobs_rate_message, var.message)}" query = < ${var.failed_queryjobs_rate_threshold_critical} + ${var.failed_queryjobs_rate_time_aggregator}(${var.failed_queryjobs_rate_timeframe}):( + default(avg:azure.devices_iothubs.jobs.query_jobs.failure${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.jobs.query_jobs.success${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.jobs.query_jobs.failure${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) ) + ) * 100 > ${var.failed_queryjobs_rate_threshold_critical} EOF type = "metric alert" @@ -113,7 +110,7 @@ resource "datadog_monitor" "status" { query = < ${var.failed_c2d_methods_rate_threshold_critical} + ${var.failed_c2d_methods_rate_time_aggregator}(${var.failed_c2d_methods_rate_timeframe}):( + default(avg:azure.devices_iothubs.c2d.methods.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.c2d.methods.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.c2d.methods.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) ) + ) * 100 > ${var.failed_c2d_methods_rate_threshold_critical} EOF type = "metric alert" @@ -204,12 +200,11 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { message = "${coalesce(var.failed_c2d_twin_read_rate_message, var.message)}" query = < ${var.failed_c2d_twin_read_rate_threshold_critical} + ${var.failed_c2d_twin_read_rate_time_aggregator}(${var.failed_c2d_twin_read_rate_timeframe}):( + default(avg:azure.devices_iothubs.c2d.twin.read.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.c2d.twin.read.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.c2d.twin.read.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) ) + ) * 100 > ${var.failed_c2d_twin_read_rate_threshold_critical} EOF type = "metric alert" @@ -240,12 +235,11 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { message = "${coalesce(var.failed_c2d_twin_update_rate_message, var.message)}" query = < ${var.failed_c2d_twin_update_rate_threshold_critical} + ${var.failed_c2d_twin_update_rate_time_aggregator}(${var.failed_c2d_twin_update_rate_timeframe}):( + default(avg:azure.devices_iothubs.c2d.twin.update.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.c2d.twin.update.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.c2d.twin.update.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) ) + ) * 100 > ${var.failed_c2d_twin_update_rate_threshold_critical} EOF type = "metric alert" @@ -276,12 +270,11 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { message = "${coalesce(var.failed_d2c_twin_read_rate_message, var.message)}" query = < ${var.failed_d2c_twin_read_rate_threshold_critical} + ${var.failed_d2c_twin_read_rate_time_aggregator}(${var.failed_d2c_twin_read_rate_timeframe}):( + default(avg:azure.devices_iothubs.d2c.twin.read.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.d2c.twin.read.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.twin.read.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) ) + ) * 100 > ${var.failed_d2c_twin_read_rate_threshold_critical} EOF type = "metric alert" @@ -312,12 +305,11 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { message = "${coalesce(var.failed_d2c_twin_update_rate_message, var.message)}" query = < ${var.failed_d2c_twin_update_rate_threshold_critical} + ${var.failed_d2c_twin_update_rate_time_aggregator}(${var.failed_d2c_twin_update_rate_timeframe}):( + default(avg:azure.devices_iothubs.d2c.twin.update.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.d2c.twin.update.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.twin.update.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) ) + ) * 100 > ${var.failed_d2c_twin_update_rate_threshold_critical} EOF type = "metric alert" @@ -348,14 +340,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { message = "${coalesce(var.dropped_d2c_telemetry_egress_message, var.message)}" query = < ${var.dropped_d2c_telemetry_egress_rate_threshold_critical} + ${var.dropped_d2c_telemetry_egress_time_aggregator}(${var.dropped_d2c_telemetry_egress_timeframe}): ( + default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) ) + ) * 100 > ${var.dropped_d2c_telemetry_egress_rate_threshold_critical} EOF type = "metric alert" @@ -386,14 +377,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { message = "${coalesce(var.orphaned_d2c_telemetry_egress_message, var.message)}" query = < ${var.orphaned_d2c_telemetry_egress_rate_threshold_critical} + ${var.orphaned_d2c_telemetry_egress_time_aggregator}(${var.orphaned_d2c_telemetry_egress_timeframe}): ( + default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) ) + ) * 100 > ${var.orphaned_d2c_telemetry_egress_rate_threshold_critical} EOF type = "metric alert" @@ -424,14 +414,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { message = "${coalesce(var.invalid_d2c_telemetry_egress_message, var.message)}" query = < ${var.invalid_d2c_telemetry_egress_rate_threshold_critical} + ${var.invalid_d2c_telemetry_egress_time_aggregator}(${var.invalid_d2c_telemetry_egress_timeframe}): ( + default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) ) + ) * 100 > ${var.invalid_d2c_telemetry_egress_rate_threshold_critical} EOF type = "metric alert" @@ -462,11 +451,10 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { message = "${coalesce(var.too_many_d2c_telemetry_ingress_nosent_message, var.message)}" query = < 0 + sum(${var.too_many_d2c_telemetry_ingress_nosent_timeframe}): ( + avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() - + avg:azure.devices_iothubs.d2c.telemetry.ingress.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() + ) > 0 EOF type = "metric alert"