MON-80 use only one message and add input descriptions

This commit is contained in:
Alexandre Gaillet 2017-10-31 14:25:24 +01:00
parent cf3309ce75
commit c1563c3318
3 changed files with 178 additions and 128 deletions

View File

@ -1,4 +1,4 @@
Azure Redis DataDog monitors Azure IOT Hubs DataDog monitors
============================ ============================
How to use this module How to use this module
@ -8,22 +8,8 @@ How to use this module
module "iothubs" { module "iothubs" {
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors" source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors"
jobs_failed_message = "${module.datadog-message-alerting.alerting-message}" message = "${module.datadog-message-alerting.alerting-message}"
listjobs_failed_message = "${module.datadog-message-alerting.alerting-message}"
queryjobs_failed_message = "${module.datadog-message-alerting.alerting-message}"
status_message = "${module.datadog-message-alerting.alerting-message}"
total_devices_message = "${module.datadog-message-alerting.alerting-message}"
c2d_methods_failed_message = "${module.datadog-message-alerting.alerting-message}"
c2d_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}"
c2d_twin_update_failed_message = "${module.datadog-message-alerting.alerting-message}"
d2c_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}"
d2c_twin_update_failed_message = "${module.datadog-message-alerting.alerting-message}"
d2c_telemetry_egress_dropped_message = "${module.datadog-message-alerting.alerting-message}"
d2c_telemetry_egress_orphaned_message = "${module.datadog-message-alerting.alerting-message}"
d2c_telemetry_egress_invalid_message = "${module.datadog-message-alerting.alerting-message}"
d2c_telemetry_egress_fallback_message = "${module.datadog-message-alerting.alerting-message}"
d2c_telemetry_ingress_nosent_message = "${module.datadog-message-alerting.alerting-message}"
environment = "${var.environment}" environment = "${var.environment}"
stack = "${var.stack}" stack = "${var.stack}"
client_name = "${var.client_name}" client_name = "${var.client_name}"
@ -56,54 +42,39 @@ Inputs
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| c2d_methods_failed_message | | string | - | yes | | c2d_methods_failed_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no |
| c2d_methods_failed_threshold_critical | | string | `10` | no | | c2d_methods_failed_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no |
| c2d_methods_failed_threshold_warning | | string | `0` | no | | c2d_twin_read_failed_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no |
| c2d_twin_read_failed_message | | string | - | yes | | c2d_twin_read_failed_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no |
| c2d_twin_read_failed_threshold_critical | | string | `10` | no | | c2d_twin_update_failed_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no |
| c2d_twin_read_failed_threshold_warning | | string | `0` | no | | c2d_twin_update_failed_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no |
| c2d_twin_update_failed_message | | string | - | yes | | client_name | Client Name | string | - | yes |
| c2d_twin_update_failed_threshold_critical | | string | `10` | no | | d2c_telemetry_egress_dropped_threshold_critical | D2C Telemetry Dropped Failed limit (critical threshold) | string | `1000` | no |
| c2d_twin_update_failed_threshold_warning | | string | `0` | no | | d2c_telemetry_egress_dropped_threshold_warning | D2C Telemetry Dropped Failed limit (warning threshold) | string | `500` | no |
| client_name | | string | - | yes | | d2c_telemetry_egress_fallback_threshold_critical | D2C Telemetry Fallback Failed limit (critical threshold) | string | `1000` | no |
| d2c_telemetry_egress_dropped_message | | string | - | yes | | d2c_telemetry_egress_fallback_threshold_warning | D2C Telemetry Fallback Failed limit (warning threshold) | string | `500` | no |
| d2c_telemetry_egress_dropped_threshold_critical | | string | `1000` | no | | d2c_telemetry_egress_invalid_threshold_critical | D2C Telemetry Invalid Failed limit (critical threshold) | string | `1000` | no |
| d2c_telemetry_egress_dropped_threshold_warning | | string | `500` | no | | d2c_telemetry_egress_invalid_threshold_warning | D2C Telemetry Invalid Failed limit (warning threshold) | string | `500` | no |
| d2c_telemetry_egress_fallback_message | | string | - | yes | | d2c_telemetry_egress_orphaned_threshold_critical | D2C Telemetry Orphaned Failed limit (critical threshold) | string | `1000` | no |
| d2c_telemetry_egress_fallback_threshold_critical | | string | `1000` | no | | d2c_telemetry_egress_orphaned_threshold_warning | D2C Telemetry Orphaned Failed limit (warning threshold) | string | `500` | no |
| d2c_telemetry_egress_fallback_threshold_warning | | string | `500` | no | | d2c_twin_read_failed_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no |
| d2c_telemetry_egress_invalid_message | | string | - | yes | | d2c_twin_read_failed_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no |
| d2c_telemetry_egress_invalid_threshold_critical | | string | `1000` | no | | d2c_twin_update_failed_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no |
| d2c_telemetry_egress_invalid_threshold_warning | | string | `500` | no | | d2c_twin_update_failed_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no |
| d2c_telemetry_egress_orphaned_message | | string | - | yes | | delay | Delay in seconds for the metric evaluation | string | `600` | no |
| d2c_telemetry_egress_orphaned_threshold_critical | | string | `1000` | no | | environment | Architecture Environment | string | - | yes |
| d2c_telemetry_egress_orphaned_threshold_warning | | string | `500` | no | | jobs_failed_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no |
| d2c_telemetry_ingress_nosent_message | | string | - | yes | | jobs_failed_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no |
| d2c_twin_read_failed_message | | string | - | yes | | listjobs_failed_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no |
| d2c_twin_read_failed_threshold_critical | | string | `10` | no | | listjobs_failed_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no |
| d2c_twin_read_failed_threshold_warning | | string | `0` | no | | message | Message sent when an alert is triggered | string | - | yes |
| d2c_twin_update_failed_message | | string | - | yes | | queryjobs_failed_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no |
| d2c_twin_update_failed_threshold_critical | | string | `10` | no | | queryjobs_failed_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no |
| d2c_twin_update_failed_threshold_warning | | string | `0` | no | | subscription_id | Subscription ID used to tag monitors | string | - | yes |
| delay | | string | `600` | no |
| environment | | string | - | yes |
| jobs_failed_message | | string | - | yes |
| jobs_failed_threshold_critical | | string | `10` | no |
| jobs_failed_threshold_warning | # IOT hubs | string | `0` | no |
| listjobs_failed_message | | string | - | yes |
| listjobs_failed_threshold_critical | | string | `10` | no |
| listjobs_failed_threshold_warning | | string | `0` | no |
| queryjobs_failed_message | | string | - | yes |
| queryjobs_failed_threshold_critical | | string | `10` | no |
| queryjobs_failed_threshold_warning | | string | `0` | no |
| stack | | string | - | yes |
| status_message | | string | - | yes |
| subscription_id | | string | - | yes |
| total_devices_message | | string | - | yes |
Related documentation Related documentation
--------------------- ---------------------
DataDog documentation: https://docs.datadoghq.com/integrations/azure_iot_hub/ DataDog documentation: [https://docs.datadoghq.com/integrations/azure_iot_hub](https://docs.datadoghq.com/integrations/azure_iot_hub)
Azure IOT Hubs metrics documentation: https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health Azure IOT Hubs metrics documentation: [https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health](https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health)

View File

@ -1,138 +1,144 @@
variable "environment" {} variable "environment" {
description = "Architecture Environment"
type = "string"
}
variable "stack" {} variable "client_name" {
description = "Client Name"
type = "string"
}
variable "client_name" {} variable "subscription_id" {
description = "Subscription ID used to tag monitors"
variable "subscription_id" {} type = "string"
}
variable "delay" { variable "delay" {
description = "Delay in seconds for the metric evaluation"
default = 600 default = 600
} }
variable "message" {
description = "Message sent when an alert is triggered"
}
## IOT hubs ## IOT hubs
variable "jobs_failed_threshold_warning" { variable "jobs_failed_threshold_warning" {
description = "Jobs Failed rate limit (warning threshold)"
default = 0 default = 0
} }
variable "jobs_failed_threshold_critical" { variable "jobs_failed_threshold_critical" {
description = "Jobs Failed rate limit (critical threshold)"
default = 10 default = 10
} }
variable "jobs_failed_message" {}
variable "listjobs_failed_threshold_warning" { variable "listjobs_failed_threshold_warning" {
description = "ListJobs Failed rate limit (warning threshold)"
default = 0 default = 0
} }
variable "listjobs_failed_threshold_critical" { variable "listjobs_failed_threshold_critical" {
description = "ListJobs Failed rate limit (critical threshold)"
default = 10 default = 10
} }
variable "listjobs_failed_message" {}
variable "queryjobs_failed_threshold_warning" { variable "queryjobs_failed_threshold_warning" {
description = "QueryJobs Failed rate limit (warning threshold)"
default = 0 default = 0
} }
variable "queryjobs_failed_threshold_critical" { variable "queryjobs_failed_threshold_critical" {
description = "QueryJobs Failed rate limit (critical threshold)"
default = 10 default = 10
} }
variable "queryjobs_failed_message" {}
variable "status_message" {}
variable "total_devices_message" {}
variable "c2d_methods_failed_threshold_warning" { variable "c2d_methods_failed_threshold_warning" {
description = "C2D Methods Failed rate limit (warning threshold)"
default = 0 default = 0
} }
variable "c2d_methods_failed_threshold_critical" { variable "c2d_methods_failed_threshold_critical" {
description = "C2D Methods Failed rate limit (critical threshold)"
default = 10 default = 10
} }
variable "c2d_methods_failed_message" {}
variable "c2d_twin_read_failed_threshold_warning" { variable "c2d_twin_read_failed_threshold_warning" {
description = "C2D Twin Read Failed rate limit (warning threshold)"
default = 0 default = 0
} }
variable "c2d_twin_read_failed_threshold_critical" { variable "c2d_twin_read_failed_threshold_critical" {
description = "C2D Twin Read Failed rate limit (critical threshold)"
default = 10 default = 10
} }
variable "c2d_twin_read_failed_message" {}
variable "c2d_twin_update_failed_threshold_warning" { variable "c2d_twin_update_failed_threshold_warning" {
description = "C2D Twin Update Failed rate limit (warning threshold)"
default = 0 default = 0
} }
variable "c2d_twin_update_failed_threshold_critical" { variable "c2d_twin_update_failed_threshold_critical" {
description = "C2D Twin Update Failed rate limit (critical threshold)"
default = 10 default = 10
} }
variable "c2d_twin_update_failed_message" {}
variable "d2c_twin_read_failed_threshold_warning" { variable "d2c_twin_read_failed_threshold_warning" {
description = "D2C Twin Read Failed rate limit (warning threshold)"
default = 0 default = 0
} }
variable "d2c_twin_read_failed_threshold_critical" { variable "d2c_twin_read_failed_threshold_critical" {
description = "D2C Twin Read Failed rate limit (critical threshold)"
default = 10 default = 10
} }
variable "d2c_twin_read_failed_message" {}
variable "d2c_twin_update_failed_threshold_warning" { variable "d2c_twin_update_failed_threshold_warning" {
description = "D2C Twin Update Failed rate limit (warning threshold)"
default = 0 default = 0
} }
variable "d2c_twin_update_failed_threshold_critical" { variable "d2c_twin_update_failed_threshold_critical" {
description = "D2C Twin Update Failed rate limit (critical threshold)"
default = 10 default = 10
} }
variable "d2c_twin_update_failed_message" {}
variable "d2c_telemetry_egress_dropped_threshold_warning" { variable "d2c_telemetry_egress_dropped_threshold_warning" {
description = "D2C Telemetry Dropped Failed limit (warning threshold)"
default = 500 default = 500
} }
variable "d2c_telemetry_egress_dropped_threshold_critical" { variable "d2c_telemetry_egress_dropped_threshold_critical" {
description = "D2C Telemetry Dropped Failed limit (critical threshold)"
default = 1000 default = 1000
} }
variable "d2c_telemetry_egress_dropped_message" {}
variable "d2c_telemetry_egress_orphaned_threshold_warning" { variable "d2c_telemetry_egress_orphaned_threshold_warning" {
description = "D2C Telemetry Orphaned Failed limit (warning threshold)"
default = 500 default = 500
} }
variable "d2c_telemetry_egress_orphaned_threshold_critical" { variable "d2c_telemetry_egress_orphaned_threshold_critical" {
description = "D2C Telemetry Orphaned Failed limit (critical threshold)"
default = 1000 default = 1000
} }
variable "d2c_telemetry_egress_orphaned_message" {}
variable "d2c_telemetry_egress_invalid_threshold_warning" { variable "d2c_telemetry_egress_invalid_threshold_warning" {
description = "D2C Telemetry Invalid Failed limit (warning threshold)"
default = 500 default = 500
} }
variable "d2c_telemetry_egress_invalid_threshold_critical" { variable "d2c_telemetry_egress_invalid_threshold_critical" {
description = "D2C Telemetry Invalid Failed limit (critical threshold)"
default = 1000 default = 1000
} }
variable "d2c_telemetry_egress_invalid_message" {}
variable "d2c_telemetry_egress_fallback_threshold_warning" { variable "d2c_telemetry_egress_fallback_threshold_warning" {
description = "D2C Telemetry Fallback Failed limit (warning threshold)"
default = 500 default = 500
} }
variable "d2c_telemetry_egress_fallback_threshold_critical" { variable "d2c_telemetry_egress_fallback_threshold_critical" {
description = "D2C Telemetry Fallback Failed limit (critical threshold)"
default = 1000 default = 1000
} }
variable "d2c_telemetry_egress_fallback_message" {}
variable "d2c_telemetry_ingress_nosent_message" {}

View File

@ -1,8 +1,14 @@
resource "datadog_monitor" "too_many_jobs_failed" { resource "datadog_monitor" "too_many_jobs_failed" {
name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}" name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}"
message = "${var.jobs_failed_message}" message = "${var.message}"
query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}" query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() /
( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() +
avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() )
) * 100 > ${var.jobs_failed_threshold_critical}
EOF
type = "query alert" type = "query alert"
thresholds { thresholds {
@ -24,9 +30,15 @@ resource "datadog_monitor" "too_many_jobs_failed" {
resource "datadog_monitor" "too_many_list_jobs_failed" { resource "datadog_monitor" "too_many_list_jobs_failed" {
name = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}" name = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}"
message = "${var.listjobs_failed_message}" message = "${var.message}"
query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}" query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() /
( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() +
avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() )
) * 100 > ${var.listjobs_failed_threshold_critical}
EOF
type = "query alert" type = "query alert"
thresholds { thresholds {
@ -48,9 +60,15 @@ resource "datadog_monitor" "too_many_list_jobs_failed" {
resource "datadog_monitor" "too_many_query_jobs_failed" { resource "datadog_monitor" "too_many_query_jobs_failed" {
name = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}" name = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}"
message = "${var.queryjobs_failed_message}" message = "${var.message}"
query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}" query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() /
( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() +
avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() )
) * 100 > ${var.queryjobs_failed_threshold_critical}
EOF
type = "query alert" type = "query alert"
thresholds { thresholds {
@ -72,9 +90,11 @@ resource "datadog_monitor" "too_many_query_jobs_failed" {
resource "datadog_monitor" "status" { resource "datadog_monitor" "status" {
name = "[${var.environment}] IOT Hub Status is not ok on {{name}}" name = "[${var.environment}] IOT Hub Status is not ok on {{name}}"
message = "${var.status_message}" message = "${var.message}"
query = "avg(last_5m):avg:azure.devices_iothubs.status{subscription_id:${var.subscription_id}} by {name,resource_group} < 1" query = <<EOF
avg(last_5m):avg:azure.devices_iothubs.status{subscription_id:${var.subscription_id}} by {name,resource_group} < 1
EOF
type = "query alert" type = "query alert"
notify_no_data = true notify_no_data = true
@ -91,9 +111,11 @@ resource "datadog_monitor" "status" {
resource "datadog_monitor" "total_devices" { resource "datadog_monitor" "total_devices" {
name = "[${var.environment}] IOT Hub Total devices is wrong on {{name}}" name = "[${var.environment}] IOT Hub Total devices is wrong on {{name}}"
message = "${var.total_devices_message}" message = "${var.message}"
query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{subscription_id:${var.subscription_id}} by {name,resource_group} == 0" query = <<EOF
avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{subscription_id:${var.subscription_id}} by {name,resource_group} == 0
EOF
type = "query alert" type = "query alert"
notify_no_data = true notify_no_data = true
@ -110,9 +132,15 @@ resource "datadog_monitor" "total_devices" {
resource "datadog_monitor" "too_many_c2d_methods_failed" { resource "datadog_monitor" "too_many_c2d_methods_failed" {
name = "[${var.environment}] IOT Hub Too many c2d methods failure on {{name}}" name = "[${var.environment}] IOT Hub Too many c2d methods failure on {{name}}"
message = "${var.c2d_methods_failed_message}" message = "${var.message}"
query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}" query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() /
( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() +
avg:azure.devices_iothubs.c2d.methods.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() )
) * 100 > ${var.c2d_methods_failed_threshold_critical}
EOF
type = "query alert" type = "query alert"
thresholds { thresholds {
@ -134,9 +162,15 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" {
resource "datadog_monitor" "too_many_c2d_twin_read_failed" { resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
name = "[${var.environment}] IOT Hub Too many c2d twin read failure on {{name}}" name = "[${var.environment}] IOT Hub Too many c2d twin read failure on {{name}}"
message = "${var.c2d_twin_read_failed_message}" message = "${var.message}"
query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}" query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() /
( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() +
avg:azure.devices_iothubs.c2d.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() )
) * 100 > ${var.c2d_twin_read_failed_threshold_critical}
EOF
type = "query alert" type = "query alert"
thresholds { thresholds {
@ -158,9 +192,15 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
resource "datadog_monitor" "too_many_c2d_twin_update_failed" { resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
name = "[${var.environment}] IOT Hub Too many c2d twin update failure on {{name}}" name = "[${var.environment}] IOT Hub Too many c2d twin update failure on {{name}}"
message = "${var.c2d_twin_update_failed_message}" message = "${var.message}"
query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_update_failed_threshold_critical}" query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() /
( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() +
avg:azure.devices_iothubs.c2d.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() )
) * 100 > ${var.c2d_twin_update_failed_threshold_critical}
EOF
type = "query alert" type = "query alert"
thresholds { thresholds {
@ -182,9 +222,15 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
resource "datadog_monitor" "too_many_d2c_twin_read_failed" { resource "datadog_monitor" "too_many_d2c_twin_read_failed" {
name = "[${var.environment}] IOT Hub Too many d2c twin read failure on {{name}}" name = "[${var.environment}] IOT Hub Too many d2c twin read failure on {{name}}"
message = "${var.d2c_twin_read_failed_message}" message = "${var.message}"
query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}" query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() /
( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() +
avg:azure.devices_iothubs.d2c.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() )
) * 100 > ${var.d2c_twin_read_failed_threshold_critical}
EOF
type = "query alert" type = "query alert"
thresholds { thresholds {
@ -206,9 +252,15 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" {
resource "datadog_monitor" "too_many_d2c_twin_update_failed" { resource "datadog_monitor" "too_many_d2c_twin_update_failed" {
name = "[${var.environment}] IOT Hub Too many d2c twin update failure on {{name}}" name = "[${var.environment}] IOT Hub Too many d2c twin update failure on {{name}}"
message = "${var.d2c_twin_update_failed_message}" message = "${var.message}"
query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}" query = <<EOF
avg(last_5m):(
avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() /
( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() +
avg:azure.devices_iothubs.d2c.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() )
) * 100 > ${var.d2c_twin_update_failed_threshold_critical}
EOF
type = "query alert" type = "query alert"
thresholds { thresholds {
@ -230,9 +282,13 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" {
resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped on {{name}}" name = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped on {{name}}"
message = "${var.d2c_telemetry_egress_dropped_message}" message = "${var.message}"
query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_dropped_threshold_critical}" query = <<EOF
sum(last_5m): (
avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count()
) > ${var.d2c_telemetry_egress_dropped_threshold_critical}
EOF
type = "query alert" type = "query alert"
thresholds { thresholds {
@ -254,9 +310,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned on {{name}}" name = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned on {{name}}"
message = "${var.d2c_telemetry_egress_orphaned_message}" message = "${var.message}"
query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_orphaned_threshold_critical}" query = <<EOF
sum(last_5m): (
avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count()
) > ${var.d2c_telemetry_egress_orphaned_threshold_critical}
EOF
type = "query alert" type = "query alert"
thresholds { thresholds {
@ -278,9 +338,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid on {{name}}" name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid on {{name}}"
message = "${var.d2c_telemetry_egress_invalid_message}" message = "${var.message}"
query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_invalid_threshold_critical}" query = <<EOF
sum(last_5m): (
avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count()
) > ${var.d2c_telemetry_egress_invalid_threshold_critical}
EOF
type = "query alert" type = "query alert"
thresholds { thresholds {
@ -302,9 +366,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" { resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" {
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress fallback on {{name}}" name = "[${var.environment}] IOT Hub Too many d2c telemetry egress fallback on {{name}}"
message = "${var.d2c_telemetry_egress_fallback_message}" message = "${var.message}"
query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.fallback{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_fallback_threshold_critical}" query = <<EOF
sum(last_5m): (
avg:azure.devices_iothubs.d2c.telemetry.egress.fallback{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count()
) > ${var.d2c_telemetry_egress_fallback_threshold_critical}
EOF
type = "query alert" type = "query alert"
thresholds { thresholds {
@ -326,9 +394,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" {
resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" {
name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress no sent on {{name}}" name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress no sent on {{name}}"
message = "${var.d2c_telemetry_ingress_nosent_message}" message = "${var.message}"
query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() - avg:azure.devices_iothubs.d2c.telemetry.ingress.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() > 0" query = <<EOF
sum(last_5m): (
avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() -
avg:azure.devices_iothubs.d2c.telemetry.ingress.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count()
) > 0
EOF
type = "query alert" type = "query alert"
notify_no_data = false notify_no_data = false