diff --git a/cloud/azure/iothubs/README.md b/cloud/azure/iothubs/README.md new file mode 100644 index 0000000..5187715 --- /dev/null +++ b/cloud/azure/iothubs/README.md @@ -0,0 +1,76 @@ +Azure IOT Hubs DataDog monitors +=============================== + +How to use this module +---------------------- + +``` +module "iothubs" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates DataDog monitors with the following checks: + +* Service status check +* Jobs failed average check +* Query Jobs failed average check +* List Jobs failed average check +* Total devices count check +* C2D methods failed average check +* C2D twin read failed average check +* C2D twin update failed average check +* D2C twin read failed average check +* D2C twin update failed average check +* D2C telemetry egress dropped count check +* D2C telemetry egress orphaned count check +* D2C telemetry egress invalid count check +* D2C telemetry egress fallback count check +* D2C telemetry ingress not sent count check + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| delay | Delay in seconds for the metric evaluation | string | `600` | no | +| dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no | +| dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no | +| environment | Architecture Environment | string | - | yes | +| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no | +| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no | +|
failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no | +| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no | +| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no | +| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no | +| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no | +| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no | +| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no | +| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no | +| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no | +| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no | +| fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no | +| fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no | +| filter_tags_custom | Tags used for custom filtering when 
filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no | +| invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no | +| message | Message sent when an alert is triggered | string | - | yes | +| orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no | +| orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_iot_hub](https://docs.datadoghq.com/integrations/azure_iot_hub) + +Azure IOT Hubs metrics documentation: [https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health](https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health) diff --git a/cloud/azure/iothubs/inputs.tf b/cloud/azure/iothubs/inputs.tf new file mode 100644 index 0000000..1b1348f --- /dev/null +++ b/cloud/azure/iothubs/inputs.tf @@ -0,0 +1,146 @@ +# Global Terraform +variable "environment" { + description = "Architecture Environment" + type = "string" +} + +# Global DataDog +variable "delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "message" { + description = "Message sent when an alert is triggered" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +# Azure IOT hubs specific +variable "failed_jobs_rate_threshold_warning" { + description = "Jobs Failed rate limit (warning 
threshold)" + default = 0 +} + +variable "failed_jobs_rate_threshold_critical" { + description = "Jobs Failed rate limit (critical threshold)" + default = 10 +} + +variable "failed_listjobs_rate_threshold_warning" { + description = "ListJobs Failed rate limit (warning threshold)" + default = 0 +} + +variable "failed_listjobs_rate_threshold_critical" { + description = "ListJobs Failed rate limit (critical threshold)" + default = 10 +} + +variable "failed_queryjobs_rate_threshold_warning" { + description = "QueryJobs Failed rate limit (warning threshold)" + default = 0 +} + +variable "failed_queryjobs_rate_threshold_critical" { + description = "QueryJobs Failed rate limit (critical threshold)" + default = 10 +} + +variable "failed_c2d_methods_rate_threshold_warning" { + description = "C2D Methods Failed rate limit (warning threshold)" + default = 0 +} + +variable "failed_c2d_methods_rate_threshold_critical" { + description = "C2D Methods Failed rate limit (critical threshold)" + default = 10 +} + +variable "failed_c2d_twin_read_rate_threshold_warning" { + description = "C2D Twin Read Failed rate limit (warning threshold)" + default = 0 +} + +variable "failed_c2d_twin_read_rate_threshold_critical" { + description = "C2D Twin Read Failed rate limit (critical threshold)" + default = 10 +} + +variable "failed_c2d_twin_update_rate_threshold_warning" { + description = "C2D Twin Update Failed rate limit (warning threshold)" + default = 0 +} + +variable "failed_c2d_twin_update_rate_threshold_critical" { + description = "C2D Twin Update Failed rate limit (critical threshold)" + default = 10 +} + +variable "failed_d2c_twin_read_rate_threshold_warning" { + description = "D2C Twin Read Failed rate limit (warning threshold)" + default = 0 +} + +variable "failed_d2c_twin_read_rate_threshold_critical" { + description = "D2C Twin Read Failed rate limit (critical threshold)" + default = 10 +} + +variable "failed_d2c_twin_update_rate_threshold_warning" { + description = "D2C Twin 
Update Failed rate limit (warning threshold)" + default = 0 +} + +variable "failed_d2c_twin_update_rate_threshold_critical" { + description = "D2C Twin Update Failed rate limit (critical threshold)" + default = 10 +} + +variable "dropped_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Dropped limit (warning threshold)" + default = 500 +} + +variable "dropped_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Dropped limit (critical threshold)" + default = 1000 +} + +variable "orphaned_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Orphaned limit (warning threshold)" + default = 500 +} + +variable "orphaned_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Orphaned limit (critical threshold)" + default = 1000 +} + +variable "invalid_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Invalid limit (warning threshold)" + default = 500 +} + +variable "invalid_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Invalid limit (critical threshold)" + default = 1000 +} + +variable "fallback_d2c_telemetry_egress_threshold_warning" { + description = "D2C Telemetry Fallback limit (warning threshold)" + default = 500 +} + +variable "fallback_d2c_telemetry_egress_threshold_critical" { + description = "D2C Telemetry Fallback limit (critical threshold)" + default = 1000 +} diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf new file mode 100644 index 0000000..6e1f926 --- /dev/null +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -0,0 +1,470 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,dd_azure_iothub:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +resource "datadog_monitor" "too_many_jobs_failed" { + name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" + message = "${var.message}" + + query = < ${var.failed_jobs_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_jobs_rate_threshold_warning}" + critical = "${var.failed_jobs_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_list_jobs_failed" { + name = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}" + message = "${var.message}" + + query = < ${var.failed_listjobs_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_listjobs_rate_threshold_warning}" + critical = "${var.failed_listjobs_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_query_jobs_failed" { + name = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}" + message = "${var.message}" + + query = < ${var.failed_queryjobs_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_queryjobs_rate_threshold_warning}" + critical = "${var.failed_queryjobs_rate_threshold_critical}" + 
} + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "status" { + name = "[${var.environment}] IOT Hub Status is not ok on {{name}}" + message = "${var.message}" + + query = < ${var.failed_c2d_methods_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_c2d_methods_rate_threshold_warning}" + critical = "${var.failed_c2d_methods_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_c2d_twin_read_failed" { + name = "[${var.environment}] IOT Hub Too many c2d twin read failure on {{name}}" + message = "${var.message}" + + query = < ${var.failed_c2d_twin_read_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_c2d_twin_read_rate_threshold_warning}" + critical = "${var.failed_c2d_twin_read_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_c2d_twin_update_failed" { + name = "[${var.environment}] IOT Hub Too many c2d twin update failure on {{name}}" 
+ message = "${var.message}" + + query = < ${var.failed_c2d_twin_update_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_c2d_twin_update_rate_threshold_warning}" + critical = "${var.failed_c2d_twin_update_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_d2c_twin_read_failed" { + name = "[${var.environment}] IOT Hub Too many d2c twin read failure on {{name}}" + message = "${var.message}" + + query = < ${var.failed_d2c_twin_read_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_d2c_twin_read_rate_threshold_warning}" + critical = "${var.failed_d2c_twin_read_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_d2c_twin_update_failed" { + name = "[${var.environment}] IOT Hub Too many d2c twin update failure on {{name}}" + message = "${var.message}" + + query = < ${var.failed_d2c_twin_update_rate_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.failed_d2c_twin_update_rate_threshold_warning}" + critical = "${var.failed_d2c_twin_update_rate_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = 
false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped on {{name}}" + message = "${var.message}" + + query = < ${var.dropped_d2c_telemetry_egress_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.dropped_d2c_telemetry_egress_threshold_warning}" + critical = "${var.dropped_d2c_telemetry_egress_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned on {{name}}" + message = "${var.message}" + + query = < ${var.orphaned_d2c_telemetry_egress_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.orphaned_d2c_telemetry_egress_threshold_warning}" + critical = "${var.orphaned_d2c_telemetry_egress_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid on {{name}}" + message = "${var.message}" + + query = < 
${var.invalid_d2c_telemetry_egress_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.invalid_d2c_telemetry_egress_threshold_warning}" + critical = "${var.invalid_d2c_telemetry_egress_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry egress fallback on {{name}}" + message = "${var.message}" + + query = < ${var.fallback_d2c_telemetry_egress_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + warning = "${var.fallback_d2c_telemetry_egress_threshold_warning}" + critical = "${var.fallback_d2c_telemetry_egress_threshold_critical}" + } + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +} + +resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { + name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress no sent on {{name}}" + message = "${var.message}" + + query = < 0 + EOF + + type = "metric alert" + + notify_no_data = false + evaluation_delay = "${var.delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:iothub", "team:azure", "provider:azure"] +}