MON-80 use only one message and add input descriptions
This commit is contained in:
parent
cf3309ce75
commit
c1563c3318
@ -1,4 +1,4 @@
|
||||
Azure Redis DataDog monitors
|
||||
Azure IOT Hubs DataDog monitors
|
||||
============================
|
||||
|
||||
How to use this module
|
||||
@ -8,22 +8,8 @@ How to use this module
|
||||
module "iothubs" {
|
||||
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/azure/iothubs?ref=MON-80-azure-hub-iot-monitors"
|
||||
|
||||
jobs_failed_message = "${module.datadog-message-alerting.alerting-message}"
|
||||
listjobs_failed_message = "${module.datadog-message-alerting.alerting-message}"
|
||||
queryjobs_failed_message = "${module.datadog-message-alerting.alerting-message}"
|
||||
status_message = "${module.datadog-message-alerting.alerting-message}"
|
||||
total_devices_message = "${module.datadog-message-alerting.alerting-message}"
|
||||
c2d_methods_failed_message = "${module.datadog-message-alerting.alerting-message}"
|
||||
c2d_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}"
|
||||
c2d_twin_update_failed_message = "${module.datadog-message-alerting.alerting-message}"
|
||||
d2c_twin_read_failed_message = "${module.datadog-message-alerting.alerting-message}"
|
||||
d2c_twin_update_failed_message = "${module.datadog-message-alerting.alerting-message}"
|
||||
d2c_telemetry_egress_dropped_message = "${module.datadog-message-alerting.alerting-message}"
|
||||
d2c_telemetry_egress_orphaned_message = "${module.datadog-message-alerting.alerting-message}"
|
||||
d2c_telemetry_egress_invalid_message = "${module.datadog-message-alerting.alerting-message}"
|
||||
d2c_telemetry_egress_fallback_message = "${module.datadog-message-alerting.alerting-message}"
|
||||
d2c_telemetry_ingress_nosent_message = "${module.datadog-message-alerting.alerting-message}"
|
||||
|
||||
message = "${module.datadog-message-alerting.alerting-message}"
|
||||
|
||||
environment = "${var.environment}"
|
||||
stack = "${var.stack}"
|
||||
client_name = "${var.client_name}"
|
||||
@ -56,54 +42,39 @@ Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| c2d_methods_failed_message | | string | - | yes |
|
||||
| c2d_methods_failed_threshold_critical | | string | `10` | no |
|
||||
| c2d_methods_failed_threshold_warning | | string | `0` | no |
|
||||
| c2d_twin_read_failed_message | | string | - | yes |
|
||||
| c2d_twin_read_failed_threshold_critical | | string | `10` | no |
|
||||
| c2d_twin_read_failed_threshold_warning | | string | `0` | no |
|
||||
| c2d_twin_update_failed_message | | string | - | yes |
|
||||
| c2d_twin_update_failed_threshold_critical | | string | `10` | no |
|
||||
| c2d_twin_update_failed_threshold_warning | | string | `0` | no |
|
||||
| client_name | | string | - | yes |
|
||||
| d2c_telemetry_egress_dropped_message | | string | - | yes |
|
||||
| d2c_telemetry_egress_dropped_threshold_critical | | string | `1000` | no |
|
||||
| d2c_telemetry_egress_dropped_threshold_warning | | string | `500` | no |
|
||||
| d2c_telemetry_egress_fallback_message | | string | - | yes |
|
||||
| d2c_telemetry_egress_fallback_threshold_critical | | string | `1000` | no |
|
||||
| d2c_telemetry_egress_fallback_threshold_warning | | string | `500` | no |
|
||||
| d2c_telemetry_egress_invalid_message | | string | - | yes |
|
||||
| d2c_telemetry_egress_invalid_threshold_critical | | string | `1000` | no |
|
||||
| d2c_telemetry_egress_invalid_threshold_warning | | string | `500` | no |
|
||||
| d2c_telemetry_egress_orphaned_message | | string | - | yes |
|
||||
| d2c_telemetry_egress_orphaned_threshold_critical | | string | `1000` | no |
|
||||
| d2c_telemetry_egress_orphaned_threshold_warning | | string | `500` | no |
|
||||
| d2c_telemetry_ingress_nosent_message | | string | - | yes |
|
||||
| d2c_twin_read_failed_message | | string | - | yes |
|
||||
| d2c_twin_read_failed_threshold_critical | | string | `10` | no |
|
||||
| d2c_twin_read_failed_threshold_warning | | string | `0` | no |
|
||||
| d2c_twin_update_failed_message | | string | - | yes |
|
||||
| d2c_twin_update_failed_threshold_critical | | string | `10` | no |
|
||||
| d2c_twin_update_failed_threshold_warning | | string | `0` | no |
|
||||
| delay | | string | `600` | no |
|
||||
| environment | | string | - | yes |
|
||||
| jobs_failed_message | | string | - | yes |
|
||||
| jobs_failed_threshold_critical | | string | `10` | no |
|
||||
| jobs_failed_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| listjobs_failed_message | | string | - | yes |
|
||||
| listjobs_failed_threshold_critical | | string | `10` | no |
|
||||
| listjobs_failed_threshold_warning | | string | `0` | no |
|
||||
| queryjobs_failed_message | | string | - | yes |
|
||||
| queryjobs_failed_threshold_critical | | string | `10` | no |
|
||||
| queryjobs_failed_threshold_warning | | string | `0` | no |
|
||||
| stack | | string | - | yes |
|
||||
| status_message | | string | - | yes |
|
||||
| subscription_id | | string | - | yes |
|
||||
| total_devices_message | | string | - | yes |
|
||||
| c2d_methods_failed_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| c2d_methods_failed_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| c2d_twin_read_failed_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| c2d_twin_read_failed_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| c2d_twin_update_failed_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| c2d_twin_update_failed_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| client_name | Client Name | string | - | yes |
|
||||
| d2c_telemetry_egress_dropped_threshold_critical | D2C Telemetry Dropped Failed limit (critical threshold) | string | `1000` | no |
|
||||
| d2c_telemetry_egress_dropped_threshold_warning | D2C Telemetry Dropped Failed limit (warning threshold) | string | `500` | no |
|
||||
| d2c_telemetry_egress_fallback_threshold_critical | D2C Telemetry Fallback Failed limit (critical threshold) | string | `1000` | no |
|
||||
| d2c_telemetry_egress_fallback_threshold_warning | D2C Telemetry Fallback Failed limit (warning threshold) | string | `500` | no |
|
||||
| d2c_telemetry_egress_invalid_threshold_critical | D2C Telemetry Invalid Failed limit (critical threshold) | string | `1000` | no |
|
||||
| d2c_telemetry_egress_invalid_threshold_warning | D2C Telemetry Invalid Failed limit (warning threshold) | string | `500` | no |
|
||||
| d2c_telemetry_egress_orphaned_threshold_critical | D2C Telemetry Orphaned Failed limit (critical threshold) | string | `1000` | no |
|
||||
| d2c_telemetry_egress_orphaned_threshold_warning | D2C Telemetry Orphaned Failed limit (warning threshold) | string | `500` | no |
|
||||
| d2c_twin_read_failed_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| d2c_twin_read_failed_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| d2c_twin_update_failed_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| d2c_twin_update_failed_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| environment | Architecture Environment | string | - | yes |
|
||||
| jobs_failed_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| jobs_failed_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| listjobs_failed_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| listjobs_failed_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| message | Message sent when an alert is triggered | string | - | yes |
|
||||
| queryjobs_failed_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| queryjobs_failed_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| subscription_id | Subscription ID used to tag monitors | string | - | yes |
|
||||
|
||||
Related documentation
|
||||
---------------------
|
||||
|
||||
DataDog documentation: https://docs.datadoghq.com/integrations/azure_iot_hub/
|
||||
DataDog documentation: [https://docs.datadoghq.com/integrations/azure_iot_hub](https://docs.datadoghq.com/integrations/azure_iot_hub)
|
||||
|
||||
Azure IOT Hubs metrics documentation: https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health
|
||||
Azure IOT Hubs metrics documentation: [https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health](https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health)
|
||||
@ -1,138 +1,144 @@
|
||||
variable "environment" {}
|
||||
variable "environment" {
|
||||
description = "Architecture Environment"
|
||||
type = "string"
|
||||
}
|
||||
|
||||
variable "stack" {}
|
||||
variable "client_name" {
|
||||
description = "Client Name"
|
||||
type = "string"
|
||||
}
|
||||
|
||||
variable "client_name" {}
|
||||
|
||||
variable "subscription_id" {}
|
||||
variable "subscription_id" {
|
||||
description = "Subscription ID used to tag monitors"
|
||||
type = "string"
|
||||
}
|
||||
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
}
|
||||
|
||||
variable "message" {
|
||||
description = "Message sent when an alert is triggered"
|
||||
}
|
||||
|
||||
## IOT hubs
|
||||
variable "jobs_failed_threshold_warning" {
|
||||
description = "Jobs Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
}
|
||||
|
||||
variable "jobs_failed_threshold_critical" {
|
||||
description = "Jobs Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "jobs_failed_message" {}
|
||||
|
||||
variable "listjobs_failed_threshold_warning" {
|
||||
description = "ListJobs Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
}
|
||||
|
||||
variable "listjobs_failed_threshold_critical" {
|
||||
description = "ListJobs Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "listjobs_failed_message" {}
|
||||
|
||||
variable "queryjobs_failed_threshold_warning" {
|
||||
description = "QueryJobs Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
}
|
||||
|
||||
variable "queryjobs_failed_threshold_critical" {
|
||||
description = "QueryJobs Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "queryjobs_failed_message" {}
|
||||
|
||||
variable "status_message" {}
|
||||
|
||||
variable "total_devices_message" {}
|
||||
|
||||
variable "c2d_methods_failed_threshold_warning" {
|
||||
description = "C2D Methods Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
}
|
||||
|
||||
variable "c2d_methods_failed_threshold_critical" {
|
||||
description = "C2D Methods Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "c2d_methods_failed_message" {}
|
||||
|
||||
variable "c2d_twin_read_failed_threshold_warning" {
|
||||
description = "C2D Twin Read Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
}
|
||||
|
||||
variable "c2d_twin_read_failed_threshold_critical" {
|
||||
description = "C2D Twin Read Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "c2d_twin_read_failed_message" {}
|
||||
|
||||
variable "c2d_twin_update_failed_threshold_warning" {
|
||||
description = "C2D Twin Update Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
}
|
||||
|
||||
variable "c2d_twin_update_failed_threshold_critical" {
|
||||
description = "C2D Twin Update Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "c2d_twin_update_failed_message" {}
|
||||
|
||||
variable "d2c_twin_read_failed_threshold_warning" {
|
||||
description = "D2C Twin Read Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
}
|
||||
|
||||
variable "d2c_twin_read_failed_threshold_critical" {
|
||||
description = "D2C Twin Read Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "d2c_twin_read_failed_message" {}
|
||||
|
||||
variable "d2c_twin_update_failed_threshold_warning" {
|
||||
description = "D2C Twin Update Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
}
|
||||
|
||||
variable "d2c_twin_update_failed_threshold_critical" {
|
||||
description = "D2C Twin Update Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "d2c_twin_update_failed_message" {}
|
||||
|
||||
variable "d2c_telemetry_egress_dropped_threshold_warning" {
|
||||
description = "D2C Telemetry Dropped Failed limit (warning threshold)"
|
||||
default = 500
|
||||
}
|
||||
|
||||
variable "d2c_telemetry_egress_dropped_threshold_critical" {
|
||||
description = "D2C Telemetry Dropped Failed limit (critical threshold)"
|
||||
default = 1000
|
||||
}
|
||||
|
||||
variable "d2c_telemetry_egress_dropped_message" {}
|
||||
|
||||
variable "d2c_telemetry_egress_orphaned_threshold_warning" {
|
||||
description = "D2C Telemetry Orphaned Failed limit (warning threshold)"
|
||||
default = 500
|
||||
}
|
||||
|
||||
variable "d2c_telemetry_egress_orphaned_threshold_critical" {
|
||||
description = "D2C Telemetry Orphaned Failed limit (critical threshold)"
|
||||
default = 1000
|
||||
}
|
||||
|
||||
variable "d2c_telemetry_egress_orphaned_message" {}
|
||||
|
||||
variable "d2c_telemetry_egress_invalid_threshold_warning" {
|
||||
description = "D2C Telemetry Invalid Failed limit (warning threshold)"
|
||||
default = 500
|
||||
}
|
||||
|
||||
variable "d2c_telemetry_egress_invalid_threshold_critical" {
|
||||
description = "D2C Telemetry Invalid Failed limit (critical threshold)"
|
||||
default = 1000
|
||||
}
|
||||
|
||||
variable "d2c_telemetry_egress_invalid_message" {}
|
||||
|
||||
variable "d2c_telemetry_egress_fallback_threshold_warning" {
|
||||
description = "D2C Telemetry Fallback Failed limit (warning threshold)"
|
||||
default = 500
|
||||
}
|
||||
|
||||
variable "d2c_telemetry_egress_fallback_threshold_critical" {
|
||||
description = "D2C Telemetry Fallback Failed limit (critical threshold)"
|
||||
default = 1000
|
||||
}
|
||||
|
||||
variable "d2c_telemetry_egress_fallback_message" {}
|
||||
|
||||
variable "d2c_telemetry_ingress_nosent_message" {}
|
||||
|
||||
@ -1,8 +1,14 @@
|
||||
resource "datadog_monitor" "too_many_jobs_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}"
|
||||
message = "${var.jobs_failed_message}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.jobs_failed_threshold_critical}"
|
||||
query = <<EOF
|
||||
avg(last_5m):(
|
||||
avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() /
|
||||
( avg:azure.devices_iothubs.jobs.failed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() +
|
||||
avg:azure.devices_iothubs.jobs.completed{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() )
|
||||
) * 100 > ${var.jobs_failed_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
|
||||
thresholds {
|
||||
@ -24,9 +30,15 @@ resource "datadog_monitor" "too_many_jobs_failed" {
|
||||
|
||||
resource "datadog_monitor" "too_many_list_jobs_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}"
|
||||
message = "${var.listjobs_failed_message}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.listjobs_failed_threshold_critical}"
|
||||
query = <<EOF
|
||||
avg(last_5m):(
|
||||
avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() /
|
||||
( avg:azure.devices_iothubs.jobs.list_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() +
|
||||
avg:azure.devices_iothubs.jobs.list_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() )
|
||||
) * 100 > ${var.listjobs_failed_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
|
||||
thresholds {
|
||||
@ -48,9 +60,15 @@ resource "datadog_monitor" "too_many_list_jobs_failed" {
|
||||
|
||||
resource "datadog_monitor" "too_many_query_jobs_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}"
|
||||
message = "${var.queryjobs_failed_message}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = "avg(last_5m):( avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() / ( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() + avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() ) ) * 100 > ${var.queryjobs_failed_threshold_critical}"
|
||||
query = <<EOF
|
||||
avg(last_5m):(
|
||||
avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() /
|
||||
( avg:azure.devices_iothubs.jobs.query_jobs.success{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() +
|
||||
avg:azure.devices_iothubs.jobs.query_jobs.failure{subscription_id:${var.subscription_id}} by {resource_group,name}.as_count() )
|
||||
) * 100 > ${var.queryjobs_failed_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
|
||||
thresholds {
|
||||
@ -72,9 +90,11 @@ resource "datadog_monitor" "too_many_query_jobs_failed" {
|
||||
|
||||
resource "datadog_monitor" "status" {
|
||||
name = "[${var.environment}] IOT Hub Status is not ok on {{name}}"
|
||||
message = "${var.status_message}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = "avg(last_5m):avg:azure.devices_iothubs.status{subscription_id:${var.subscription_id}} by {name,resource_group} < 1"
|
||||
query = <<EOF
|
||||
avg(last_5m):avg:azure.devices_iothubs.status{subscription_id:${var.subscription_id}} by {name,resource_group} < 1
|
||||
EOF
|
||||
type = "query alert"
|
||||
|
||||
notify_no_data = true
|
||||
@ -91,9 +111,11 @@ resource "datadog_monitor" "status" {
|
||||
|
||||
resource "datadog_monitor" "total_devices" {
|
||||
name = "[${var.environment}] IOT Hub Total devices is wrong on {{name}}"
|
||||
message = "${var.total_devices_message}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = "avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{subscription_id:${var.subscription_id}} by {name,resource_group} == 0"
|
||||
query = <<EOF
|
||||
avg(last_5m):avg:azure.devices_iothubs.devices.total_devices{subscription_id:${var.subscription_id}} by {name,resource_group} == 0
|
||||
EOF
|
||||
type = "query alert"
|
||||
|
||||
notify_no_data = true
|
||||
@ -110,9 +132,15 @@ resource "datadog_monitor" "total_devices" {
|
||||
|
||||
resource "datadog_monitor" "too_many_c2d_methods_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many c2d methods failure on {{name}}"
|
||||
message = "${var.c2d_methods_failed_message}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.methods.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_methods_failed_threshold_critical}"
|
||||
query = <<EOF
|
||||
avg(last_5m):(
|
||||
avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() /
|
||||
( avg:azure.devices_iothubs.c2d.methods.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() +
|
||||
avg:azure.devices_iothubs.c2d.methods.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() )
|
||||
) * 100 > ${var.c2d_methods_failed_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
|
||||
thresholds {
|
||||
@ -134,9 +162,15 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" {
|
||||
|
||||
resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many c2d twin read failure on {{name}}"
|
||||
message = "${var.c2d_twin_read_failed_message}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_read_failed_threshold_critical}"
|
||||
query = <<EOF
|
||||
avg(last_5m):(
|
||||
avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() /
|
||||
( avg:azure.devices_iothubs.c2d.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() +
|
||||
avg:azure.devices_iothubs.c2d.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() )
|
||||
) * 100 > ${var.c2d_twin_read_failed_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
|
||||
thresholds {
|
||||
@ -158,9 +192,15 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
|
||||
|
||||
resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many c2d twin update failure on {{name}}"
|
||||
message = "${var.c2d_twin_update_failed_message}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = "avg(last_5m):( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.c2d.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.c2d_twin_update_failed_threshold_critical}"
|
||||
query = <<EOF
|
||||
avg(last_5m):(
|
||||
avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() /
|
||||
( avg:azure.devices_iothubs.c2d.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() +
|
||||
avg:azure.devices_iothubs.c2d.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() )
|
||||
) * 100 > ${var.c2d_twin_update_failed_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
|
||||
thresholds {
|
||||
@ -182,9 +222,15 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_twin_read_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c twin read failure on {{name}}"
|
||||
message = "${var.d2c_twin_read_failed_message}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_read_failed_threshold_critical}"
|
||||
query = <<EOF
|
||||
avg(last_5m):(
|
||||
avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() /
|
||||
( avg:azure.devices_iothubs.d2c.twin.read.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() +
|
||||
avg:azure.devices_iothubs.d2c.twin.read.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() )
|
||||
) * 100 > ${var.d2c_twin_read_failed_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
|
||||
thresholds {
|
||||
@ -206,9 +252,15 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" {
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_twin_update_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c twin update failure on {{name}}"
|
||||
message = "${var.d2c_twin_update_failed_message}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = "avg(last_5m):( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() / ( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() + avg:azure.devices_iothubs.d2c.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() ) ) * 100 > ${var.d2c_twin_update_failed_threshold_critical}"
|
||||
query = <<EOF
|
||||
avg(last_5m):(
|
||||
avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() /
|
||||
( avg:azure.devices_iothubs.d2c.twin.update.failure{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() +
|
||||
avg:azure.devices_iothubs.d2c.twin.update.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() )
|
||||
) * 100 > ${var.d2c_twin_update_failed_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
|
||||
thresholds {
|
||||
@ -230,9 +282,13 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" {
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped on {{name}}"
|
||||
message = "${var.d2c_telemetry_egress_dropped_message}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_dropped_threshold_critical}"
|
||||
query = <<EOF
|
||||
sum(last_5m): (
|
||||
avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count()
|
||||
) > ${var.d2c_telemetry_egress_dropped_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
|
||||
thresholds {
|
||||
@ -254,9 +310,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned on {{name}}"
|
||||
message = "${var.d2c_telemetry_egress_orphaned_message}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_orphaned_threshold_critical}"
|
||||
query = <<EOF
|
||||
sum(last_5m): (
|
||||
avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count()
|
||||
) > ${var.d2c_telemetry_egress_orphaned_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
|
||||
thresholds {
|
||||
@ -278,9 +338,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid on {{name}}"
|
||||
message = "${var.d2c_telemetry_egress_invalid_message}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_invalid_threshold_critical}"
|
||||
query = <<EOF
|
||||
sum(last_5m): (
|
||||
avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count()
|
||||
) > ${var.d2c_telemetry_egress_invalid_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
|
||||
thresholds {
|
||||
@ -302,9 +366,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress fallback on {{name}}"
|
||||
message = "${var.d2c_telemetry_egress_fallback_message}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.egress.fallback{subscription_id:${var.subscription_id}} by {name,resource_group} > ${var.d2c_telemetry_egress_fallback_threshold_critical}"
|
||||
query = <<EOF
|
||||
sum(last_5m): (
|
||||
avg:azure.devices_iothubs.d2c.telemetry.egress.fallback{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count()
|
||||
) > ${var.d2c_telemetry_egress_fallback_threshold_critical}
|
||||
EOF
|
||||
type = "query alert"
|
||||
|
||||
thresholds {
|
||||
@ -326,9 +394,14 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" {
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress no sent on {{name}}"
|
||||
message = "${var.d2c_telemetry_ingress_nosent_message}"
|
||||
message = "${var.message}"
|
||||
|
||||
query = "sum(last_5m):avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() - avg:azure.devices_iothubs.d2c.telemetry.ingress.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() > 0"
|
||||
query = <<EOF
|
||||
sum(last_5m): (
|
||||
avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count() -
|
||||
avg:azure.devices_iothubs.d2c.telemetry.ingress.success{subscription_id:${var.subscription_id}} by {name,resource_group}.as_count()
|
||||
) > 0
|
||||
EOF
|
||||
type = "query alert"
|
||||
|
||||
notify_no_data = false
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user