Merged in MON-191-make-the-aggregator-customizable (pull request #87)

MON-191 make the aggregator customizable

Approved-by: Alexandre Gaillet <alexandre.gaillet@fr.clara.net>
Approved-by: Boris Rousseau <boris.rousseau@morea.fr>
Approved-by: Quentin Manfroi <quentin.manfroi@yahoo.fr>
Approved-by: Laurent Piroelle <laurent.piroelle@fr.clara.net>
This commit is contained in:
Alexandre Gaillet 2018-06-28 08:37:52 +00:00 committed by Quentin Manfroi
commit 12fcaba6cc
63 changed files with 1224 additions and 327 deletions

Binary file not shown.

View File

@ -38,10 +38,16 @@ variable "alb_no_healthy_instances_message" {
default = ""
}
variable "alb_no_healthy_instances_time_aggregator" {
description = "Monitor aggregator for ALB no healthy instances [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "alb_no_healthy_instances_timeframe" {
description = "Monitor timeframe for ALB no healthy instances [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_1m"
default = "last_5m"
}
variable "latency_silenced" {
@ -56,6 +62,12 @@ variable "latency_message" {
default = ""
}
variable "latency_time_aggregator" {
description = "Monitor aggregator for ALB latency [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "latency_timeframe" {
description = "Monitor timeframe for ALB latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -72,30 +84,30 @@ variable "latency_threshold_warning" {
description = "latency warning threshold in milliseconds"
}
variable "httpcode_elb_4xx_silenced" {
variable "httpcode_alb_4xx_silenced" {
description = "Groups to mute for ALB httpcode 4xx monitor"
type = "map"
default = {}
}
variable "httpcode_elb_4xx_message" {
variable "httpcode_alb_4xx_message" {
description = "Custom message for ALB httpcode 4xx monitor"
type = "string"
default = ""
}
variable "httpcode_elb_4xx_timeframe" {
variable "httpcode_alb_4xx_timeframe" {
description = "Monitor timeframe for ALB httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "httpcode_elb_4xx_threshold_critical" {
variable "httpcode_alb_4xx_threshold_critical" {
default = 80
description = "loadbalancer 4xx critical threshold in percentage"
}
variable "httpcode_elb_4xx_threshold_warning" {
variable "httpcode_alb_4xx_threshold_warning" {
default = 60
description = "loadbalancer 4xx warning threshold in percentage"
}
@ -128,30 +140,30 @@ variable "httpcode_target_4xx_threshold_warning" {
description = "target 4xx warning threshold in percentage"
}
variable "httpcode_elb_5xx_silenced" {
variable "httpcode_alb_5xx_silenced" {
description = "Groups to mute for ALB httpcode 5xx monitor"
type = "map"
default = {}
}
variable "httpcode_elb_5xx_message" {
variable "httpcode_alb_5xx_message" {
description = "Custom message for ALB httpcode 5xx monitor"
type = "string"
default = ""
}
variable "httpcode_elb_5xx_timeframe" {
variable "httpcode_alb_5xx_timeframe" {
description = "Monitor timeframe for ALB httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "httpcode_elb_5xx_threshold_critical" {
variable "httpcode_alb_5xx_threshold_critical" {
default = 80
description = "loadbalancer 5xxcritical threshold in percentage"
description = "loadbalancer 5xx critical threshold in percentage"
}
variable "httpcode_elb_5xx_threshold_warning" {
variable "httpcode_alb_5xx_threshold_warning" {
default = 60
description = "loadbalancer 5xx warning threshold in percentage"
}

View File

@ -14,9 +14,9 @@ resource "datadog_monitor" "ALB_no_healthy_instances" {
message = "${coalesce(var.alb_no_healthy_instances_message, var.message)}"
query = <<EOF
min(${var.alb_no_healthy_instances_timeframe}): (
${var.alb_no_healthy_instances_time_aggregator}(${var.alb_no_healthy_instances_timeframe}): (
min:aws.applicationelb.healthy_host_count{${data.template_file.filter.rendered}} by {region,loadbalancer}
) <= 0
) < 1
EOF
evaluation_delay = "${var.delay}"
@ -43,8 +43,8 @@ resource "datadog_monitor" "ALB_latency" {
message = "${coalesce(var.latency_message, var.message)}"
query = <<EOF
min(${var.latency_timeframe}): (
min:aws.applicationelb.target_response_time.average{${data.template_file.filter.rendered}} by {region,loadbalancer}
${var.latency_time_aggregator}(${var.latency_timeframe}): (
avg:aws.applicationelb.target_response_time.average{${data.template_file.filter.rendered}} by {region,loadbalancer}
) > ${var.latency_threshold_critical}
EOF
@ -67,26 +67,26 @@ resource "datadog_monitor" "ALB_latency" {
tags = ["env:${var.environment}", "resource:alb", "team:aws", "provider:aws"]
}
resource "datadog_monitor" "ALB_httpcode_elb_5xx" {
resource "datadog_monitor" "ALB_httpcode_5xx" {
name = "[${var.environment}] ALB HTTP code 5xx {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
type = "metric alert"
message = "${coalesce(var.httpcode_elb_5xx_message, var.message)}"
message = "${coalesce(var.httpcode_alb_5xx_message, var.message)}"
query = <<EOF
min(${var.httpcode_elb_5xx_timeframe}): (
sum(${var.httpcode_alb_5xx_timeframe}): (
default(
min:aws.applicationelb.httpcode_elb_5xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
(min:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
avg:aws.applicationelb.httpcode_elb_5xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
(avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
0) * 100
) > ${var.httpcode_elb_5xx_threshold_critical}
) > ${var.httpcode_alb_5xx_threshold_critical}
EOF
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"
thresholds {
critical = "${var.httpcode_elb_5xx_threshold_critical}"
warning = "${var.httpcode_elb_5xx_threshold_warning}"
critical = "${var.httpcode_alb_5xx_threshold_critical}"
warning = "${var.httpcode_alb_5xx_threshold_warning}"
}
notify_no_data = false
@ -95,31 +95,31 @@ resource "datadog_monitor" "ALB_httpcode_elb_5xx" {
timeout_h = 0
include_tags = true
silenced = "${var.httpcode_elb_5xx_silenced}"
silenced = "${var.httpcode_alb_5xx_silenced}"
tags = ["env:${var.environment}", "resource:alb", "team:aws", "provider:aws"]
}
resource "datadog_monitor" "ALB_httpcode_elb_4xx" {
resource "datadog_monitor" "ALB_httpcode_4xx" {
name = "[${var.environment}] ALB HTTP code 4xx {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
type = "metric alert"
message = "${coalesce(var.httpcode_elb_4xx_message, var.message)}"
message = "${coalesce(var.httpcode_alb_4xx_message, var.message)}"
query = <<EOF
min(${var.httpcode_elb_4xx_timeframe}): (
sum(${var.httpcode_alb_4xx_timeframe}): (
default(
min:aws.applicationelb.httpcode_elb_4xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
(min:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
avg:aws.applicationelb.httpcode_elb_4xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
(avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
0) * 100
) > ${var.httpcode_elb_4xx_threshold_critical}
) > ${var.httpcode_alb_4xx_threshold_critical}
EOF
evaluation_delay = "${var.delay}"
new_host_delay = "${var.delay}"
thresholds {
critical = "${var.httpcode_elb_4xx_threshold_critical}"
warning = "${var.httpcode_elb_4xx_threshold_warning}"
critical = "${var.httpcode_alb_4xx_threshold_critical}"
warning = "${var.httpcode_alb_4xx_threshold_warning}"
}
notify_no_data = false
@ -128,7 +128,7 @@ resource "datadog_monitor" "ALB_httpcode_elb_4xx" {
timeout_h = 0
include_tags = true
silenced = "${var.httpcode_elb_4xx_silenced}"
silenced = "${var.httpcode_alb_4xx_silenced}"
tags = ["env:${var.environment}", "resource:alb", "team:aws", "provider:aws"]
}
@ -139,10 +139,10 @@ resource "datadog_monitor" "ALB_httpcode_target_5xx" {
message = "${coalesce(var.httpcode_target_5xx_message, var.message)}"
query = <<EOF
min(${var.httpcode_target_5xx_timeframe}): (
sum(${var.httpcode_target_5xx_timeframe}): (
default(
min:aws.applicationelb.httpcode_target_5xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
(min:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
avg:aws.applicationelb.httpcode_target_5xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
(avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
0) * 100
) > ${var.httpcode_target_5xx_threshold_critical}
EOF
@ -172,10 +172,10 @@ resource "datadog_monitor" "ALB_httpcode_target_4xx" {
message = "${coalesce(var.httpcode_target_4xx_message, var.message)}"
query = <<EOF
min(${var.httpcode_target_4xx_timeframe}): (
sum(${var.httpcode_target_4xx_timeframe}): (
default(
min:aws.applicationelb.httpcode_target_4xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
(min:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
avg:aws.applicationelb.httpcode_target_4xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
(avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
0) * 100
) > ${var.httpcode_target_4xx_threshold_critical}
EOF

Binary file not shown.

View File

@ -33,6 +33,12 @@ variable "latency_message" {
default = ""
}
variable "latency_time_aggregator" {
description = "Monitor aggregator for API Gateway latency [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "latency_timeframe" {
description = "Monitor timeframe for API latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -5,8 +5,8 @@ resource "datadog_monitor" "API_Gateway_latency" {
message = "${coalesce(var.latency_message, var.message)}"
query = <<EOF
min(${var.latency_timeframe}): (
min:aws.apigateway.latency{${var.filter_tags}} by {region,apiname}
${var.latency_time_aggregator}(${var.latency_timeframe}): (
avg:aws.apigateway.latency{${var.filter_tags}} by {region,apiname}
) > ${var.latency_threshold_critical}
EOF
@ -36,10 +36,10 @@ resource "datadog_monitor" "API_http_5xx_errors_count" {
message = "${coalesce(var.http_5xx_requests_message, var.message)}"
query = <<EOF
min(${var.http_5xx_requests_timeframe}): (
sum(${var.http_5xx_requests_timeframe}): (
default(
min:aws.apigateway.5xxerror{${var.filter_tags}} by {region,apiname}.as_count() /
(min:aws.apigateway.count{${var.filter_tags}} by {region,apiname}.as_count() + ${var.artificial_requests_count}),
avg:aws.apigateway.5xxerror{${var.filter_tags}} by {region,apiname}.as_count() /
(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname}.as_count() + ${var.artificial_requests_count}),
0) * 100
) > ${var.http_5xx_requests_threshold_critical}
EOF
@ -70,10 +70,10 @@ resource "datadog_monitor" "API_http_4xx_errors_count" {
message = "${coalesce(var.http_4xx_requests_message, var.message)}"
query = <<EOF
min(${var.http_4xx_requests_timeframe}): (
sum(${var.http_4xx_requests_timeframe}): (
default(
min:aws.apigateway.4xxerror{${var.filter_tags}} by {region,apiname}.as_count() /
(min:aws.apigateway.count{${var.filter_tags}} by {region,apiname}.as_count() + ${var.artificial_requests_count}),
avg:aws.apigateway.4xxerror{${var.filter_tags}} by {region,apiname}.as_count() /
(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname}.as_count() + ${var.artificial_requests_count}),
0) * 100
) > ${var.http_4xx_requests_threshold_critical}
EOF

Binary file not shown.

View File

@ -60,6 +60,12 @@ variable "diskspace_message" {
default = ""
}
variable "diskspace_time_aggregator" {
description = "Monitor aggregator for ES cluster diskspace [available values: min, max or avg]"
type = "string"
default = "max"
}
variable "diskspace_timeframe" {
description = "Monitor timeframe for ES cluster diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -88,6 +94,12 @@ variable "cpu_message" {
default = ""
}
variable "cpu_time_aggregator" {
description = "Monitor aggregator for ES cluster cpu [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "cpu_timeframe" {
description = "Monitor timeframe for ES cluster cpu [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -52,7 +52,7 @@ resource "datadog_monitor" "es_free_space_low" {
type = "metric alert"
query = <<EOF
avg(${var.diskspace_timeframe}): (
${var.diskspace_time_aggregator}(${var.diskspace_timeframe}): (
avg:aws.es.free_storage_space{${data.template_file.filter.rendered}} by {region,name} /
(${var.es_cluster_volume_size}*1000) * 100
) < ${var.diskspace_threshold_critical}
@ -86,7 +86,7 @@ resource "datadog_monitor" "es_cpu_90_15min" {
type = "metric alert"
query = <<EOF
avg(${var.cpu_timeframe}): (
${var.cpu_time_aggregator}(${var.cpu_timeframe}): (
avg:aws.es.cpuutilization{${data.template_file.filter.rendered}} by {region,name}
) > ${var.cpu_threshold_critical}
EOF

View File

@ -25,42 +25,45 @@ Creates DataDog monitors with the following checks :
* ELB backend http code 4xx percent too high
* ELB backend http code 5xx percent too high
Inputs
------
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| artificial_requests_count | Number of false requests used to mitigate false positive in case of low traffic | string | `5` | no |
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
| elb_4xx_message | Custom message for ELB 4xx errors monitor | string | `` | no |
| elb_4xx_silenced | Groups to mute for ELB 4xx errors monitor | map | `<map>` | no |
| elb_4xx_threshold_critical | loadbalancer 4xx critical threshold in percentage | string | `10` | no |
| elb_4xx_threshold_warning | loadbalancer 4xx warning threshold in percentage | string | `5` | no |
| elb_4xx_timeframe | Monitor timeframe for ELB 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| elb_5xx_message | Custom message for ELB 5xx errors monitor | string | `` | no |
| elb_5xx_silenced | Groups to mute for ELB 5xx errors monitor | map | `<map>` | no |
| elb_5xx_threshold_critical | loadbalancer 5xx critical threshold in percentage | string | `10` | no |
| elb_5xx_threshold_warning | loadbalancer 5xx warning threshold in percentage | string | `5` | no |
| elb_5xx_timeframe | Monitor timeframe for ELB 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| elb_backend_4xx_message | Custom message for ELB backend 4xx errors monitor | string | `` | no |
| elb_backend_4xx_silenced | Groups to mute for ELB backend 4xx errors monitor | map | `<map>` | no |
| elb_backend_4xx_threshold_critical | loadbalancer backend 4xx critical threshold in percentage | string | `10` | no |
| elb_backend_4xx_threshold_warning | loadbalancer backend 4xx warning threshold in percentage | string | `5` | no |
| elb_backend_4xx_timeframe | Monitor timeframe for ELB backend 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| elb_backend_5xx_message | Custom message for ELB backend 5xx errors monitor | string | `` | no |
| elb_backend_5xx_silenced | Groups to mute for ELB backend 5xx errors monitor | map | `<map>` | no |
| elb_backend_5xx_threshold_critical | loadbalancer backend 5xx critical threshold in percentage | string | `10` | no |
| elb_backend_5xx_threshold_warning | loadbalancer backend 5xx warning threshold in percentage | string | `5` | no |
| elb_backend_5xx_timeframe | Monitor timeframe for ELB backend 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| elb_backend_latency_critical | latency critical threshold in seconds | string | `5` | no |
| elb_backend_latency_message | Custom message for ELB backend latency monitor | string | `` | no |
| elb_backend_latency_silenced | Groups to mute for ELB backend latency monitor | map | `<map>` | no |
| elb_backend_latency_timeframe | Monitor timeframe for ELB backend latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| elb_backend_latency_warning | latency warning threshold in seconds | string | `1` | no |
| elb_no_healthy_instance_message | Custom message for ELB no healthy instance monitor | string | `` | no |
| elb_no_healthy_instance_silenced | Groups to mute for ELB no healthy instance monitor | map | `<map>` | no |
| elb_no_healthy_instance_timeframe | Monitor timeframe for ELB no healthy instance [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| environment | Architecture Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| message | Message sent when an alert is triggered | string | - | yes |
Inputs
------
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| artificial_requests_count | Number of false requests used to mitigate false positive in case of low traffic | string | `5` | no |
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
| elb_4xx_message | Custom message for ELB 4xx errors monitor | string | `` | no |
| elb_4xx_silenced | Groups to mute for ELB 4xx errors monitor | map | `<map>` | no |
| elb_4xx_threshold_critical | loadbalancer 4xx critical threshold in percentage | string | `10` | no |
| elb_4xx_threshold_warning | loadbalancer 4xx warning threshold in percentage | string | `5` | no |
| elb_4xx_timeframe | Monitor timeframe for ELB 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| elb_5xx_message | Custom message for ELB 5xx errors monitor | string | `` | no |
| elb_5xx_silenced | Groups to mute for ELB 5xx errors monitor | map | `<map>` | no |
| elb_5xx_threshold_critical | loadbalancer 5xx critical threshold in percentage | string | `10` | no |
| elb_5xx_threshold_warning | loadbalancer 5xx warning threshold in percentage | string | `5` | no |
| elb_5xx_timeframe | Monitor timeframe for ELB 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| elb_backend_4xx_message | Custom message for ELB backend 4xx errors monitor | string | `` | no |
| elb_backend_4xx_silenced | Groups to mute for ELB backend 4xx errors monitor | map | `<map>` | no |
| elb_backend_4xx_threshold_critical | loadbalancer backend 4xx critical threshold in percentage | string | `10` | no |
| elb_backend_4xx_threshold_warning | loadbalancer backend 4xx warning threshold in percentage | string | `5` | no |
| elb_backend_4xx_timeframe | Monitor timeframe for ELB backend 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| elb_backend_5xx_message | Custom message for ELB backend 5xx errors monitor | string | `` | no |
| elb_backend_5xx_silenced | Groups to mute for ELB backend 5xx errors monitor | map | `<map>` | no |
| elb_backend_5xx_threshold_critical | loadbalancer backend 5xx critical threshold in percentage | string | `10` | no |
| elb_backend_5xx_threshold_warning | loadbalancer backend 5xx warning threshold in percentage | string | `5` | no |
| elb_backend_5xx_timeframe | Monitor timeframe for ELB backend 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| elb_backend_latency_critical | latency critical threshold in seconds | string | `5` | no |
| elb_backend_latency_message | Custom message for ELB backend latency monitor | string | `` | no |
| elb_backend_latency_silenced | Groups to mute for ELB backend latency monitor | map | `<map>` | no |
| elb_backend_latency_time_aggregator | Monitor aggregator for ELB backend latency [available values: min, max or avg] | string | `min` | no |
| elb_backend_latency_timeframe | Monitor timeframe for ELB backend latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| elb_backend_latency_warning | latency warning threshold in seconds | string | `1` | no |
| elb_no_healthy_instance_message | Custom message for ELB no healthy instance monitor | string | `` | no |
| elb_no_healthy_instance_silenced | Groups to mute for ELB no healthy instance monitor | map | `<map>` | no |
| elb_no_healthy_instance_time_aggregator | Monitor aggregator for ELB no healthy instance [available values: min or max] | string | `min` | no |
| elb_no_healthy_instance_timeframe | Monitor timeframe for ELB no healthy instance [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| environment | Architecture Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| message | Message sent when an alert is triggered | string | - | yes |

View File

@ -37,6 +37,12 @@ variable "elb_no_healthy_instance_message" {
default = ""
}
variable "elb_no_healthy_instance_time_aggregator" {
description = "Monitor aggregator for ELB no healthy instance [available values: min or max]"
type = "string"
default = "min"
}
variable "elb_no_healthy_instance_timeframe" {
description = "Monitor timeframe for ELB no healthy instance [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -167,6 +173,12 @@ variable "elb_backend_latency_message" {
default = ""
}
variable "elb_backend_latency_time_aggregator" {
description = "Monitor aggregator for ELB backend latency [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "elb_backend_latency_timeframe" {
description = "Monitor timeframe for ELB backend latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -11,7 +11,7 @@ resource "datadog_monitor" "ELB_no_healthy_instances" {
message = "${coalesce(var.elb_no_healthy_instance_message, var.message)}"
query = <<EOF
min(${var.elb_no_healthy_instance_timeframe}): (
${var.elb_no_healthy_instance_time_aggregator}(${var.elb_no_healthy_instance_timeframe}): (
min:aws.elb.healthy_host_count{${data.template_file.filter.rendered}} by {region,loadbalancername}
) < 1
EOF
@ -38,10 +38,10 @@ resource "datadog_monitor" "ELB_too_much_4xx" {
message = "${coalesce(var.elb_4xx_message, var.message)}"
query = <<EOF
min(${var.elb_4xx_timeframe}): (
sum(${var.elb_4xx_timeframe}): (
default(
min:aws.elb.httpcode_elb_4xx{${data.template_file.filter.rendered}} by {region,loadbalancername}.as_count() /
(min:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername}.as_count() + ${var.artificial_requests_count}),
avg:aws.elb.httpcode_elb_4xx{${data.template_file.filter.rendered}} by {region,loadbalancername}.as_count() /
(avg:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername}.as_count() + ${var.artificial_requests_count}),
0) * 100
) > ${var.elb_4xx_threshold_critical}
EOF
@ -73,10 +73,10 @@ resource "datadog_monitor" "ELB_too_much_5xx" {
message = "${coalesce(var.elb_5xx_message, var.message)}"
query = <<EOF
min(${var.elb_5xx_timeframe}): (
sum(${var.elb_5xx_timeframe}): (
default(
min:aws.elb.httpcode_elb_5xx{${data.template_file.filter.rendered}} by {region,loadbalancername} /
(min:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername} + ${var.artificial_requests_count}),
avg:aws.elb.httpcode_elb_5xx{${data.template_file.filter.rendered}} by {region,loadbalancername} /
(avg:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername} + ${var.artificial_requests_count}),
0) * 100
) > ${var.elb_5xx_threshold_critical}
EOF
@ -108,10 +108,10 @@ resource "datadog_monitor" "ELB_too_much_4xx_backend" {
message = "${coalesce(var.elb_backend_4xx_message, var.message)}"
query = <<EOF
min(${var.elb_backend_4xx_timeframe}): (
sum(${var.elb_backend_4xx_timeframe}): (
default(
min:aws.elb.httpcode_backend_4xx{${data.template_file.filter.rendered}} by {region,loadbalancername} /
(min:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername} + ${var.artificial_requests_count}),
avg:aws.elb.httpcode_backend_4xx{${data.template_file.filter.rendered}} by {region,loadbalancername} /
(avg:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername} + ${var.artificial_requests_count}),
0) * 100
) > ${var.elb_backend_4xx_threshold_critical}
EOF
@ -143,10 +143,10 @@ resource "datadog_monitor" "ELB_too_much_5xx_backend" {
message = "${coalesce(var.elb_backend_5xx_message, var.message)}"
query = <<EOF
min(${var.elb_backend_5xx_timeframe}): (
sum(${var.elb_backend_5xx_timeframe}): (
default(
min:aws.elb.httpcode_backend_5xx{${data.template_file.filter.rendered}} by {region,loadbalancername} /
(min:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername} + ${var.artificial_requests_count}),
avg:aws.elb.httpcode_backend_5xx{${data.template_file.filter.rendered}} by {region,loadbalancername} /
(avg:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername} + ${var.artificial_requests_count}),
0) * 100
) > ${var.elb_backend_5xx_threshold_critical}
EOF
@ -178,8 +178,8 @@ resource "datadog_monitor" "ELB_backend_latency" {
message = "${coalesce(var.elb_backend_latency_message, var.message)}"
query = <<EOF
min(${var.elb_backend_latency_warning}): (
min:aws.elb.latency{${data.template_file.filter.rendered}} by {region,loadbalancername}
${var.elb_backend_latency_time_aggregator}(${var.elb_backend_latency_timeframe}): (
avg:aws.elb.latency{${data.template_file.filter.rendered}} by {region,loadbalancername}
) > ${var.elb_backend_latency_critical}
EOF

Binary file not shown.

View File

@ -14,10 +14,10 @@ resource "datadog_monitor" "firehose_incoming_records" {
type = "metric alert"
query = <<EOF
sum(${var.incoming_records_timeframe}): (
avg:aws.firehose.incoming_records{${data.template_file.filter.rendered}} by {region,deliverystreamname}
) <= 0
EOF
sum(${var.incoming_records_timeframe}): (
avg:aws.firehose.incoming_records{${data.template_file.filter.rendered}} by {region,deliverystreamname}
) <= 0
EOF
thresholds {
critical = 0

Binary file not shown.

View File

@ -38,6 +38,12 @@ variable "cpu_message" {
default = ""
}
variable "cpu_time_aggregator" {
description = "Monitor aggregator for RDS CPU usage [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "cpu_timeframe" {
description = "Monitor timeframe for RDS CPU usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -66,6 +72,12 @@ variable "diskspace_message" {
default = ""
}
variable "diskspace_time_aggregator" {
description = "Monitor aggregator for RDS free diskspace [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "diskspace_timeframe" {
description = "Monitor timeframe for RDS free diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -14,7 +14,7 @@ resource "datadog_monitor" "rds_cpu_90_15min" {
type = "metric alert"
query = <<EOF
avg(${var.cpu_timeframe}): (
${var.cpu_time_aggregator}(${var.cpu_timeframe}): (
avg:aws.rds.cpuutilization{${data.template_file.filter.rendered}} by {region,name}
) > ${var.cpu_threshold_critical}
EOF
@ -46,7 +46,7 @@ resource "datadog_monitor" "rds_free_space_low" {
type = "metric alert"
query = <<EOF
avg(${var.diskspace_timeframe}): (
${var.diskspace_time_aggregator}(${var.diskspace_timeframe}): (
avg:aws.rds.free_storage_space{${data.template_file.filter.rendered}} by {region,name} /
avg:aws.rds.total_storage_space{${data.template_file.filter.rendered}} by {region,name} * 100
) < ${var.diskspace_threshold_critical}

Binary file not shown.

View File

@ -36,6 +36,12 @@ variable "vpn_status_message" {
default = ""
}
variable "vpn_status_time_aggregator" {
description = "Monitor aggregator for VPN status [available values: min, max or avg]"
type = "string"
default = "max"
}
variable "vpn_status_timeframe" {
description = "Monitor timeframe for VPN status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -11,7 +11,7 @@ resource "datadog_monitor" "VPN_status" {
message = "${coalesce(var.vpn_status_message, var.message)}"
query = <<EOF
avg(${var.vpn_status_timeframe}): (
${var.vpn_status_time_aggregator}(${var.vpn_status_timeframe}): (
avg:aws.vpn.tunnel_state{${data.template_file.filter.rendered}} by {region,name}
) < 1
EOF

View File

@ -36,193 +36,268 @@ Inputs
| apimanagement_failed_requests_silenced | Groups to mute for API Management failed requests monitor | map | `<map>` | no |
| apimanagement_failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `90` | no |
| apimanagement_failed_requests_threshold_warning | Warning regarding acceptable percent of failed requests | string | `50` | no |
| apimanagement_failed_requests_timeframe | Monitor timeframe for API Management failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| apimanagement_other_requests_message | Custom message for API Management other requests monitor | string | `` | no |
| apimanagement_other_requests_silenced | Groups to mute for API Management other requests monitor | map | `<map>` | no |
| apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `90` | no |
| apimanagement_other_requests_threshold_warning | Warning regarding acceptable percent of other requests | string | `50` | no |
| apimanagement_other_requests_timeframe | Monitor timeframe for API Management other requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| apimanagement_status_message | Custom message for API Management status monitor | string | `` | no |
| apimanagement_status_silenced | Groups to mute for API Management status monitor | map | `<map>` | no |
| apimanagement_status_time_aggregator | Monitor aggregator for API Management status [available values: min, max or avg] | string | `max` | no |
| apimanagement_status_timeframe | Monitor timeframe for API Management status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| apimanagement_successful_requests_message | Custom message for API Management successful requests monitor | string | `` | no |
| apimanagement_successful_requests_silenced | Groups to mute for API Management successful requests monitor | map | `<map>` | no |
| apimanagement_successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `10` | no |
| apimanagement_successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests | string | `30` | no |
| apimanagement_successful_requests_timeframe | Monitor timeframe for API Management successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| apimanagement_unauthorized_requests_message | Custom message for API Management unauthorized requests monitor | string | `` | no |
| apimanagement_unauthorized_requests_silenced | Groups to mute for API Management unauthorized requests monitor | map | `<map>` | no |
| apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `90` | no |
| apimanagement_unauthorized_requests_threshold_warning | Warning regarding acceptable percent of unauthorized requests | string | `50` | no |
| apimanagement_unauthorized_requests_timeframe | Monitor timeframe for API Management unauthorized requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| appservices_http_4xx_requests_message | Custom message for App Services 4xx requests monitor | string | `` | no |
| appservices_http_4xx_requests_silenced | Groups to mute for App Services 4xx requests monitor | map | `<map>` | no |
| appservices_http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `90` | no |
| appservices_http_4xx_requests_threshold_warning | Warning regarding acceptable percent of 4xx errors | string | `50` | no |
| appservices_http_4xx_requests_timeframe | Monitor timeframe for App Services 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| appservices_http_5xx_requests_message | Custom message for App Services 5xx requests monitor | string | `` | no |
| appservices_http_5xx_requests_silenced | Groups to mute for App Services 5xx requests monitor | map | `<map>` | no |
| appservices_http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `90` | no |
| appservices_http_5xx_requests_threshold_warning | Warning regarding acceptable percent of 5xx errors | string | `50` | no |
| appservices_http_5xx_requests_timeframe | Monitor timeframe for App Services 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| appservices_http_successful_requests_message | Custom message for App Services successful requests monitor | string | `` | no |
| appservices_http_successful_requests_silenced | Groups to mute for App Services successful requests monitor | map | `<map>` | no |
| appservices_http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `10` | no |
| appservices_http_successful_requests_threshold_warning | Warning regarding acceptable percent of 2xx & 3xx requests | string | `30` | no |
| appservices_http_successful_requests_timeframe | Monitor timeframe for App Services successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| appservices_memory_usage_message | Custom message for App Services memory usage monitor | string | `` | no |
| appservices_memory_usage_silenced | Groups to mute for App Services memory usage monitor | map | `<map>` | no |
| appservices_memory_usage_threshold_critical | Alerting threshold in MiB | string | `1073741824` | no |
| appservices_memory_usage_threshold_warning | Warning threshold in MiB | string | `536870912` | no |
| appservices_memory_usage_time_aggregator | Monitor aggregator for App Services memory usage [available values: min, max or avg] | string | `min` | no |
| appservices_memory_usage_timeframe | Monitor timeframe for App Services memory usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| appservices_response_time_message | Custom message for App Services response time monitor | string | `` | no |
| appservices_response_time_silenced | Groups to mute for App Services response time monitor | map | `<map>` | no |
| appservices_response_time_threshold_critical | Alerting threshold for response time in seconds | string | `10` | no |
| appservices_response_time_threshold_warning | Warning threshold for response time in seconds | string | `5` | no |
| appservices_response_time_time_aggregator | Monitor aggregator for App Services response time [available values: min, max or avg] | string | `min` | no |
| appservices_response_time_timeframe | Monitor timeframe for App Services response time [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
| environment | Architecture environment | string | - | yes |
| eventhub_errors_rate_message | Custom message for Event Hub errors monitor | string | `` | no |
| eventhub_errors_rate_silenced | Groups to mute for Event Hub errors monitor | map | `<map>` | no |
| eventhub_errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `90` | no |
| eventhub_errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `50` | no |
| eventhub_errors_rate_timeframe | Monitor timeframe for Event Hub errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| eventhub_failed_requests_rate_message | Custom message for Event Hub failed requests monitor | string | `` | no |
| eventhub_failed_requests_rate_silenced | Groups to mute for Event Hub failed requests monitor | map | `<map>` | no |
| eventhub_failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `90` | no |
| eventhub_failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `50` | no |
| eventhub_failed_requests_rate_timeframe | Monitor timeframe for Event Hub failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| eventhub_status_message | Custom message for Event Hub status monitor | string | `` | no |
| eventhub_status_silenced | Groups to mute for Event Hub status monitor | map | `<map>` | no |
| eventhub_status_time_aggregator | Monitor aggregator for Event Hub status [available values: min, max or avg] | string | `max` | no |
| eventhub_status_timeframe | Monitor timeframe for Event Hub status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| iothub_dropped_d2c_telemetry_egress_message | Custom message for IoT Hub dropped d2c telemetry monitor | string | `` | no |
| iothub_dropped_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `90` | no |
| iothub_dropped_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `50` | no |
| iothub_dropped_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub dropped d2c telemetry monitor | map | `<map>` | no |
| iothub_dropped_d2c_telemetry_egress_timeframe | Monitor timeframe for IoT Hub dropped d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| iothub_failed_c2d_methods_rate_message | Custom message for IoT Hub failed c2d method monitor | string | `` | no |
| iothub_failed_c2d_methods_rate_silenced | Groups to mute for IoT Hub failed c2d methods monitor | map | `<map>` | no |
| iothub_failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `50` | no |
| iothub_failed_c2d_methods_rate_timeframe | Monitor timeframe for IoT Hub failed c2d method [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| iothub_failed_c2d_twin_read_rate_message | Custom message for IoT Hub failed c2d twin read monitor | string | `` | no |
| iothub_failed_c2d_twin_read_rate_silenced | Groups to mute for IoT Hub failed c2d twin read monitor | map | `<map>` | no |
| iothub_failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `50` | no |
| iothub_failed_c2d_twin_read_rate_timeframe | Monitor timeframe for IoT Hub failed c2d twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| iothub_failed_c2d_twin_update_rate_message | Custom message for IoT Hub failed c2d twin update monitor | string | `` | no |
| iothub_failed_c2d_twin_update_rate_silenced | Groups to mute for IoT Hub failed c2d twin update monitor | map | `<map>` | no |
| iothub_failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `50` | no |
| iothub_failed_c2d_twin_update_rate_timeframe | Monitor timeframe for IoT Hub failed c2d twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| iothub_failed_d2c_twin_read_rate_message | Custom message for IoT Hub failed d2c twin read monitor | string | `` | no |
| iothub_failed_d2c_twin_read_rate_silenced | Groups to mute for IoT Hub failed d2c twin read monitor | map | `<map>` | no |
| iothub_failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `50` | no |
| iothub_failed_d2c_twin_read_rate_timeframe | Monitor timeframe for IoT Hub failed d2c twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| iothub_failed_d2c_twin_update_rate_message | Custom message for IoT Hub failed d2c twin update monitor | string | `` | no |
| iothub_failed_d2c_twin_update_rate_silenced | Groups to mute for IoT Hub failed d2c twin update monitor | map | `<map>` | no |
| iothub_failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `50` | no |
| iothub_failed_d2c_twin_update_rate_timeframe | Monitor timeframe for IoT Hub failed d2c twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| iothub_failed_jobs_rate_message | Custom message for IoT Hub failed jobs monitor | string | `` | no |
| iothub_failed_jobs_rate_silenced | Groups to mute for IoT Hub failed jobs monitor | map | `<map>` | no |
| iothub_failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `50` | no |
| iothub_failed_jobs_rate_timeframe | Monitor timeframe for IoT Hub failed jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| iothub_failed_listjobs_rate_message | Custom message for IoT Hub failed list jobs monitor | string | `` | no |
| iothub_failed_listjobs_rate_silenced | Groups to mute for IoT Hub failed list jobs monitor | map | `<map>` | no |
| iothub_failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `50` | no |
| iothub_failed_listjobs_rate_timeframe | Monitor timeframe for IoT Hub failed list jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| iothub_failed_queryjobs_rate_message | Custom message for IoT Hub failed query jobs monitor | string | `` | no |
| iothub_failed_queryjobs_rate_silenced | Groups to mute for IoT Hub failed query jobs monitor | map | `<map>` | no |
| iothub_failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `90` | no |
| iothub_failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` | no |
| iothub_failed_queryjobs_rate_timeframe | Monitor timeframe for IoT Hub failed query jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| iothub_invalid_d2c_telemetry_egress_message | Custom message for IoT Hub invalid d2c telemetry monitor | string | `` | no |
| iothub_invalid_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `90` | no |
| iothub_invalid_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `50` | no |
| iothub_invalid_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub invalid d2c telemetry monitor | map | `<map>` | no |
| iothub_invalid_d2c_telemetry_egress_timeframe | Monitor timeframe for IoT Hub invalid d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| iothub_orphaned_d2c_telemetry_egress_message | Custom message for IoT Hub orphaned d2c telemetry monitor | string | `` | no |
| iothub_orphaned_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `90` | no |
| iothub_orphaned_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `50` | no |
| iothub_orphaned_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub orphaned d2c telemetry monitor | map | `<map>` | no |
| iothub_orphaned_d2c_telemetry_egress_timeframe | Monitor timeframe for IoT Hub orphaned d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| iothub_status_message | Custom message for IoT Hub status monitor | string | `` | no |
| iothub_status_silenced | Groups to mute for IoT Hub status monitor | map | `<map>` | no |
| iothub_status_time_aggregator | Monitor aggregator for IoT Hub status [available values: min, max or avg] | string | `max` | no |
| iothub_status_timeframe | Monitor timeframe for IoT Hub status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| iothub_too_many_d2c_telemetry_ingress_nosent_message | Custom message for IoT Hub unsent d2c telemetry monitor | string | `` | no |
| iothub_too_many_d2c_telemetry_ingress_nosent_silenced | Groups to mute for IoT Hub unsent d2c telemetry monitor | map | `<map>` | no |
| iothub_too_many_d2c_telemetry_ingress_nosent_timeframe | Monitor timeframe for IoT Hub unsent d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| iothub_total_devices_message | Custom message for IoT Hub total devices monitor | string | `` | no |
| iothub_total_devices_silenced | Groups to mute for IoT Hub total devices monitor | map | `<map>` | no |
| iothub_total_devices_time_aggregator | Monitor aggregator for IoT Hub total devices [available values: min, max or avg] | string | `min` | no |
| iothub_total_devices_timeframe | Monitor timeframe for IoT Hub total devices [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| message | Message sent when a monitor is triggered | string | - | yes |
| non_taggable_filter_tags | Tags used for filtering for components without tag support | string | `*` | no |
| redis_evictedkeys_limit_message | Custom message for Redis evicted keys monitor | string | `` | no |
| redis_evictedkeys_limit_silenced | Groups to mute for Redis evicted keys monitor | map | `<map>` | no |
| redis_evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no |
| redis_evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no |
| redis_evictedkeys_limit_time_aggregator | Monitor aggregator for Redis evicted keys [available values: min, max or avg] | string | `avg` | no |
| redis_evictedkeys_limit_timeframe | Monitor timeframe for Redis evicted keys [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| redis_percent_processor_time_message | Custom message for Redis processor monitor | string | `` | no |
| redis_percent_processor_time_silenced | Groups to mute for Redis processor monitor | map | `<map>` | no |
| redis_percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no |
| redis_percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no |
| redis_percent_processor_time_time_aggregator | Monitor aggregator for Redis processor [available values: min, max or avg] | string | `min` | no |
| redis_percent_processor_time_timeframe | Monitor timeframe for Redis processor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| redis_server_load_rate_message | Custom message for Redis server load monitor | string | `` | no |
| redis_server_load_rate_silenced | Groups to mute for Redis server load monitor | map | `<map>` | no |
| redis_server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no |
| redis_server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no |
| redis_server_load_rate_time_aggregator | Monitor aggregator for Redis server load [available values: min, max or avg] | string | `min` | no |
| redis_server_load_rate_timeframe | Monitor timeframe for Redis server load [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| redis_status_message | Custom message for Redis status monitor | string | `` | no |
| redis_status_silenced | Groups to mute for Redis status monitor | map | `<map>` | no |
| redis_status_time_aggregator | Monitor aggregator for Redis status [available values: min, max or avg] | string | `max` | no |
| redis_status_timeframe | Monitor timeframe for Redis status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| servicebus_status_message | Custom message for Service Bus status monitor | string | `` | no |
| servicebus_status_silenced | Groups to mute for Service Bus status monitor | map | `<map>` | no |
| servicebus_status_time_aggregator | Monitor aggregator for Service Bus status [available values: min, max or avg] | string | `max` | no |
| servicebus_status_timeframe | Monitor timeframe for Service Bus status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
| servicebus_status_aggregator | Monitor timeframe aggregator for Service Bus status [available values: min, max, sum or avg] | string | `min` | no |
| sqldatabase_cpu_message | Custom message for SQL CPU monitor | string | `` | no |
| sqldatabase_cpu_silenced | Groups to mute for SQL CPU monitor | map | `<map>` | no |
| sqldatabase_cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no |
| sqldatabase_cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no |
| sqldatabase_cpu_time_aggregator | Monitor aggregator for SQL CPU [available values: min, max or avg] | string | `min` | no |
| sqldatabase_cpu_timeframe | Monitor timeframe for SQL CPU [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
| sqldatabase_deadlock_message | Custom message for SQL Deadlock monitor | string | `` | no |
| sqldatabase_deadlock_silenced | Groups to mute for SQL Deadlock monitor | map | `<map>` | no |
| sqldatabase_deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no |
| sqldatabase_deadlock_timeframe | Monitor timeframe for SQL Deadlock [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| sqldatabase_diskspace_message | Custom message for SQL disk space monitor | string | `` | no |
| sqldatabase_diskspace_silenced | Groups to mute for SQL disk space monitor | map | `<map>` | no |
| sqldatabase_diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no |
| sqldatabase_diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no |
| sqldatabase_diskspace_time_aggregator | Monitor aggregator for SQL disk space [available values: min, max or avg] | string | `max` | no |
| sqldatabase_diskspace_timeframe | Monitor timeframe for SQL disk space [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
| sqldatabase_dtu_message | Custom message for SQL DTU monitor | string | `` | no |
| sqldatabase_dtu_silenced | Groups to mute for SQL DTU monitor | map | `<map>` | no |
| sqldatabase_dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no |
| sqldatabase_dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no |
| sqldatabase_dtu_time_aggregator | Monitor aggregator for SQL DTU [available values: min, max or avg] | string | `avg` | no |
| sqldatabase_dtu_timeframe | Monitor timeframe for SQL DTU [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
| storage_authorization_error_requests_message | Custom message for Storage authorization errors monitor | string | `` | no |
| storage_authorization_error_requests_silenced | Groups to mute for Storage authorization errors monitor | map | `<map>` | no |
| storage_authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `90` | no |
| storage_authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `50` | no |
| storage_authorization_error_requests_time_aggregator | Monitor aggregator for Storage authorization errors [available values: min, max or avg] | string | `min` | no |
| storage_authorization_error_requests_timeframe | Monitor timeframe for Storage authorization errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| storage_availability_message | Custom message for Storage availability monitor | string | `` | no |
| storage_availability_silenced | Groups to mute for Storage availability monitor | map | `<map>` | no |
| storage_availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `50` | no |
| storage_availability_threshold_warning | Warning regarding acceptable percent of availability for a storage | string | `90` | no |
| storage_availability_time_aggregator | Monitor aggregator for Storage availability [available values: min, max or avg] | string | `max` | no |
| storage_availability_timeframe | Monitor timeframe for Storage availability [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| storage_client_other_error_requests_message | Custom message for Storage other errors monitor | string | `` | no |
| storage_client_other_error_requests_silenced | Groups to mute for Storage other errors monitor | map | `<map>` | no |
| storage_client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `90` | no |
| storage_client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `50` | no |
| storage_client_other_error_requests_time_aggregator | Monitor aggregator for Storage other errors [available values: min, max or avg] | string | `min` | no |
| storage_client_other_error_requests_timeframe | Monitor timeframe for Storage other errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| storage_latency_message | Custom message for Storage latency monitor | string | `` | no |
| storage_latency_silenced | Groups to mute for Storage latency monitor | map | `<map>` | no |
| storage_latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `2000` | no |
| storage_latency_threshold_warning | Warning regarding acceptable end to end latency (ms) for a storage | string | `1000` | no |
| storage_latency_time_aggregator | Monitor aggregator for Storage latency [available values: min, max or avg] | string | `min` | no |
| storage_latency_timeframe | Monitor timeframe for Storage latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| storage_network_error_requests_message | Custom message for Storage network errors monitor | string | `` | no |
| storage_network_error_requests_silenced | Groups to mute for Storage network errors monitor | map | `<map>` | no |
| storage_network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `90` | no |
| storage_network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `50` | no |
| storage_network_error_requests_time_aggregator | Monitor aggregator for Storage network errors [available values: min, max or avg] | string | `min` | no |
| storage_network_error_requests_timeframe | Monitor timeframe for Storage network errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| storage_server_other_error_requests_message | Custom message for Storage server other errors monitor | string | `` | no |
| storage_server_other_error_requests_silenced | Groups to mute for Storage server other errors monitor | map | `<map>` | no |
| storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `90` | no |
| storage_server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `50` | no |
| storage_server_other_error_requests_time_aggregator | Monitor aggregator for Storage server other errors [available values: min, max or avg] | string | `min` | no |
| storage_server_other_error_requests_timeframe | Monitor timeframe for Storage server other errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| storage_successful_requests_message | Custom message for Storage successful requests monitor | string | `` | no |
| storage_successful_requests_silenced | Groups to mute for Storage successful requests monitor | map | `<map>` | no |
| storage_successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `10` | no |
| storage_successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `30` | no |
| storage_successful_requests_time_aggregator | Monitor aggregator for Storage successful requests [available values: min, max or avg] | string | `max` | no |
| storage_successful_requests_timeframe | Monitor timeframe for Storage successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| storage_throttling_error_requests_message | Custom message for Storage throttling error monitor | string | `` | no |
| storage_throttling_error_requests_silenced | Groups to mute for Storage throttling error monitor | map | `<map>` | no |
| storage_throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `90` | no |
| storage_throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `50` | no |
| storage_throttling_error_requests_time_aggregator | Monitor aggregator for Storage throttling errors [available values: min, max or avg] | string | `min` | no |
| storage_throttling_error_requests_timeframe | Monitor timeframe for Storage throttling errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| storage_timeout_error_requests_message | Custom message for Storage timeout monitor | string | `` | no |
| storage_timeout_error_requests_silenced | Groups to mute for Storage timeout monitor | map | `<map>` | no |
| storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `90` | no |
| storage_timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `50` | no |
| storage_timeout_error_requests_time_aggregator | Monitor aggregator for Storage timeout [available values: min, max or avg] | string | `min` | no |
| storage_timeout_error_requests_timeframe | Monitor timeframe for Storage timeout [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| streamanalytics_conversion_errors_message | Custom message for Stream Analytics conversion errors monitor | string | `` | no |
| streamanalytics_conversion_errors_silenced | Groups to mute for Stream Analytics conversion errors monitor | map | `<map>` | no |
| streamanalytics_conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no |
| streamanalytics_conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no |
| streamanalytics_conversion_errors_time_aggregator | Monitor aggregator for Stream Analytics conversion errors [available values: min, max or avg] | string | `min` | no |
| streamanalytics_conversion_errors_timeframe | Monitor timeframe for Stream Analytics conversion errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| streamanalytics_failed_function_requests_message | Custom message for Stream Analytics failed requests monitor | string | `` | no |
| streamanalytics_failed_function_requests_silenced | Groups to mute for Stream Analytics failed requests monitor | map | `<map>` | no |
| streamanalytics_failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no |
| streamanalytics_failed_function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no |
| streamanalytics_failed_function_requests_timeframe | Monitor timeframe for Stream Analytics failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| streamanalytics_runtime_errors_message | Custom message for Stream Analytics runtime errors monitor | string | `` | no |
| streamanalytics_runtime_errors_silenced | Groups to mute for Stream Analytics runtime errors monitor | map | `<map>` | no |
| streamanalytics_runtime_errors_threshold_critical | Runtime errors limit (critical threshold) | string | `10` | no |
| streamanalytics_runtime_errors_threshold_warning | Runtime errors limit (warning threshold) | string | `0` | no |
| streamanalytics_runtime_errors_time_aggregator | Monitor aggregator for Stream Analytics runtime errors [available values: min, max or avg] | string | `min` | no |
| streamanalytics_runtime_errors_timeframe | Monitor timeframe for Stream Analytics runtime errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| streamanalytics_status_message | Custom message for Stream Analytics status monitor | string | `` | no |
| streamanalytics_status_silenced | Groups to mute for Stream Analytics status monitor | map | `<map>` | no |
| streamanalytics_status_time_aggregator | Monitor aggregator for Stream Analytics status [available values: min, max or avg] | string | `max` | no |
| streamanalytics_status_timeframe | Monitor timeframe for Stream Analytics status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| streamanalytics_su_utilization_message | Custom message for Stream Analytics utilization monitor | string | `` | no |
| streamanalytics_su_utilization_silenced | Groups to mute for Stream Analytics utilization monitor | map | `<map>` | no |
| streamanalytics_su_utilization_threshold_critical | Streaming Unit utilization rate limit (critical threshold) | string | `80` | no |
| streamanalytics_su_utilization_threshold_warning | Streaming Unit utilization rate limit (warning threshold) | string | `60` | no |
| streamanalytics_su_utilization_time_aggregator | Monitor aggregator for Stream Analytics utilization [available values: min, max or avg] | string | `min` | no |
| streamanalytics_su_utilization_timeframe | Monitor timeframe for Stream Analytics utilization [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
Related documentation
---------------------
@ -230,3 +305,4 @@ Related documentation
DataDog documentation: [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/)
Azure metrics documentation: [https://docs.microsoft.com/en-us/azure/monitoring-and-diagnostics/monitoring-overview-metrics](https://docs.microsoft.com/en-us/azure/monitoring-and-diagnostics/monitoring-overview-metrics)

Binary file not shown.

View File

@ -37,6 +37,12 @@ variable "status_message" {
default = ""
}
variable "status_time_aggregator" {
description = "Monitor aggregator for API Management status [available values: min, max or avg]"
type = "string"
default = "max"
}
variable "status_timeframe" {
description = "Monitor timeframe for API Management status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -13,8 +13,8 @@ resource "datadog_monitor" "apimgt_status" {
message = "${coalesce(var.status_message, var.message)}"
query = <<EOF
avg(${var.status_timeframe}):avg:azure.apimanagement_service.status{${data.template_file.filter.rendered}} by {resource_group,region,name} < 1
EOF
${var.status_time_aggregator}(${var.status_timeframe}):avg:azure.apimanagement_service.status{${data.template_file.filter.rendered}} by {resource_group,region,name} < 1
EOF
type = "metric alert"
@ -46,7 +46,7 @@ resource "datadog_monitor" "apimgt_failed_requests" {
avg:azure.apimanagement_service.failed_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() /
avg:azure.apimanagement_service.total_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() * 100
) > ${var.failed_requests_threshold_critical}
EOF
EOF
thresholds {
critical = "${var.failed_requests_threshold_critical}"
@ -78,7 +78,7 @@ resource "datadog_monitor" "apimgt_other_requests" {
avg:azure.apimanagement_service.other_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() /
avg:azure.apimanagement_service.total_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() * 100
) > ${var.other_requests_threshold_critical}
EOF
EOF
thresholds {
critical = "${var.other_requests_threshold_critical}"
@ -110,7 +110,7 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" {
avg:azure.apimanagement_service.unauthorized_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() /
avg:azure.apimanagement_service.total_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() * 100
) > ${var.unauthorized_requests_threshold_critical}
EOF
EOF
thresholds {
critical = "${var.unauthorized_requests_threshold_critical}"
@ -142,7 +142,7 @@ resource "datadog_monitor" "apimgt_successful_requests" {
avg:azure.apimanagement_service.successful_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() /
avg:azure.apimanagement_service.total_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() * 100
) < ${var.successful_requests_threshold_critical}
EOF
EOF
thresholds {
critical = "${var.successful_requests_threshold_critical}"

Binary file not shown.

View File

@ -35,6 +35,12 @@ variable "response_time_message" {
default = ""
}
variable "response_time_time_aggregator" {
description = "Monitor aggregator for App Services response time [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "response_time_timeframe" {
description = "Monitor timeframe for App Services response time [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -63,6 +69,12 @@ variable "memory_usage_message" {
default = ""
}
variable "memory_usage_time_aggregator" {
description = "Monitor aggregator for App Services memory usage [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "memory_usage_timeframe" {
description = "Monitor timeframe for App Services memory usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -13,7 +13,7 @@ resource "datadog_monitor" "appservices_response_time" {
message = "${coalesce(var.response_time_message, var.message)}"
query = <<EOF
min(last_5m): (
${var.response_time_time_aggregator}(${var.response_time_timeframe}): (
avg:azure.app_services.average_response_time{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.response_time_threshold_critical}
EOF
@ -44,7 +44,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" {
message = "${coalesce(var.memory_usage_message, var.message)}"
query = <<EOF
avg(${var.memory_usage_timeframe}): (
${var.memory_usage_time_aggregator}(${var.memory_usage_timeframe}): (
avg:azure.app_services.memory_working_set{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.memory_usage_threshold_critical}
EOF

Binary file not shown.

View File

@ -37,6 +37,12 @@ variable "status_message" {
default = ""
}
variable "status_time_aggregator" {
description = "Monitor aggregator for Event Hub status [available values: min, max or avg]"
type = "string"
default = "max"
}
variable "status_timeframe" {
description = "Monitor timeframe for Event Hub status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -11,8 +11,10 @@ resource "datadog_monitor" "eventhub_status" {
message = "${coalesce(var.status_message, var.message)}"
query = <<EOF
avg(${var.status_timeframe}): avg:azure.eventhub_namespaces.status{${data.template_file.filter.rendered}} by {resource_group,region,name} != 1
EOF
${var.status_time_aggregator}(${var.status_timeframe}): (
avg:azure.eventhub_namespaces.status{${data.template_file.filter.rendered}} by {resource_group,region,name}
) != 1
EOF
type = "metric alert"
@ -42,7 +44,7 @@ resource "datadog_monitor" "eventhub_failed_requests" {
avg:azure.eventhub_namespaces.incoming_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count(),
0) * 100
) > ${var.failed_requests_rate_thresold_critical}
EOF
EOF
type = "metric alert"
@ -82,7 +84,7 @@ resource "datadog_monitor" "eventhub_errors" {
),
0) * 100
) > ${var.errors_rate_thresold_critical}
EOF
EOF
type = "metric alert"

View File

@ -41,6 +41,18 @@ variable "apimanagement_status_message" {
default = ""
}
variable "apimanagement_status_time_aggregator" {
description = "Monitor aggregator for API Management status [available values: min, max or avg]"
type = "string"
default = "max"
}
variable "apimanagement_status_timeframe" {
description = "Monitor timeframe for API Management status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "apimanagement_failed_requests_silenced" {
description = "Groups to mute for API Management failed requests monitor"
type = "map"
@ -53,6 +65,12 @@ variable "apimanagement_failed_requests_message" {
default = ""
}
variable "apimanagement_failed_requests_timeframe" {
description = "Monitor timeframe for API Management failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "apimanagement_failed_requests_threshold_critical" {
description = "Maximum acceptable percent of failed requests"
default = 90
@ -75,6 +93,12 @@ variable "apimanagement_other_requests_message" {
default = ""
}
variable "apimanagement_other_requests_timeframe" {
description = "Monitor timeframe for API Management other requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "apimanagement_other_requests_threshold_critical" {
description = "Maximum acceptable percent of other requests"
default = 90
@ -97,6 +121,12 @@ variable "apimanagement_unauthorized_requests_message" {
default = ""
}
variable "apimanagement_unauthorized_requests_timeframe" {
description = "Monitor timeframe for API Management unauthorized requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "apimanagement_unauthorized_requests_threshold_critical" {
description = "Maximum acceptable percent of unauthorized requests"
default = 90
@ -119,6 +149,12 @@ variable "apimanagement_successful_requests_message" {
default = ""
}
variable "apimanagement_successful_requests_timeframe" {
description = "Monitor timeframe for API Management successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "apimanagement_successful_requests_threshold_critical" {
description = "Minimum acceptable percent of successful requests"
default = 10
@ -142,6 +178,18 @@ variable "appservices_response_time_message" {
default = ""
}
variable "appservices_response_time_time_aggregator" {
description = "Monitor aggregator for App Services response time [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "appservices_response_time_timeframe" {
description = "Monitor timeframe for App Services response time [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "appservices_response_time_threshold_critical" {
default = 10
description = "Alerting threshold for response time in seconds"
@ -164,6 +212,18 @@ variable "appservices_memory_usage_message" {
default = ""
}
variable "appservices_memory_usage_time_aggregator" {
description = "Monitor aggregator for App Services memory usage [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "appservices_memory_usage_timeframe" {
description = "Monitor timeframe for App Services memory usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "appservices_memory_usage_threshold_critical" {
default = 1073741824 # 1Gb
description = "Alerting threshold in bytes"
@ -186,6 +246,12 @@ variable "appservices_http_4xx_requests_message" {
default = ""
}
variable "appservices_http_4xx_requests_timeframe" {
description = "Monitor timeframe for App Services 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "appservices_http_4xx_requests_threshold_critical" {
default = 90
description = "Maximum critical acceptable percent of 4xx errors"
@ -208,6 +274,12 @@ variable "appservices_http_5xx_requests_message" {
default = ""
}
variable "appservices_http_5xx_requests_timeframe" {
description = "Monitor timeframe for App Services 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "appservices_http_5xx_requests_threshold_critical" {
default = 90
description = "Maximum critical acceptable percent of 5xx errors"
@ -230,6 +302,12 @@ variable "appservices_http_successful_requests_message" {
default = ""
}
variable "appservices_http_successful_requests_timeframe" {
description = "Monitor timeframe for App Services successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "appservices_http_successful_requests_threshold_critical" {
default = 10
description = "Minimum critical acceptable percent of 2xx & 3xx requests"
@ -253,6 +331,18 @@ variable "eventhub_status_message" {
default = ""
}
variable "eventhub_status_time_aggregator" {
description = "Monitor aggregator for Event Hub status [available values: min, max or avg]"
type = "string"
default = "max"
}
variable "eventhub_status_timeframe" {
description = "Monitor timeframe for Event Hub status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "eventhub_failed_requests_rate_silenced" {
description = "Groups to mute for Event Hub failed requests monitor"
type = "map"
@ -265,6 +355,12 @@ variable "eventhub_failed_requests_rate_message" {
default = ""
}
variable "eventhub_failed_requests_rate_timeframe" {
description = "Monitor timeframe for Event Hub failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "eventhub_failed_requests_rate_thresold_critical" {
description = "Failed requests ratio (percentage) to trigger the critical alert"
default = 90
@ -287,6 +383,12 @@ variable "eventhub_errors_rate_message" {
default = ""
}
variable "eventhub_errors_rate_timeframe" {
description = "Monitor timeframe for Event Hub errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "eventhub_errors_rate_thresold_critical" {
description = "Errors ratio (percentage) to trigger the critical alert"
default = 90
@ -310,6 +412,18 @@ variable "iothub_status_message" {
default = ""
}
variable "iothub_status_time_aggregator" {
description = "Monitor aggregator for IoT Hub status [available values: min, max or avg]"
type = "string"
default = "max"
}
variable "iothub_status_timeframe" {
description = "Monitor timeframe for IoT Hub status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "iothub_total_devices_silenced" {
description = "Groups to mute for IoT Hub total devices monitor"
type = "map"
@ -322,6 +436,18 @@ variable "iothub_total_devices_message" {
default = ""
}
variable "iothub_total_devices_time_aggregator" {
description = "Monitor aggregator for IoT Hub total devices [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "iothub_total_devices_timeframe" {
description = "Monitor timeframe for IoT Hub total devices [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "iothub_too_many_d2c_telemetry_ingress_nosent_silenced" {
description = "Groups to mute for IoT Hub unsent d2c telemetry monitor"
type = "map"
@ -334,6 +460,12 @@ variable "iothub_too_many_d2c_telemetry_ingress_nosent_message" {
default = ""
}
variable "iothub_too_many_d2c_telemetry_ingress_nosent_timeframe" {
description = "Monitor timeframe for IoT Hub unsent d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "iothub_failed_jobs_rate_silenced" {
description = "Groups to mute for IoT Hub failed jobs monitor"
type = "map"
@ -346,6 +478,12 @@ variable "iothub_failed_jobs_rate_message" {
default = ""
}
variable "iothub_failed_jobs_rate_timeframe" {
description = "Monitor timeframe for IoT Hub failed jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "iothub_failed_jobs_rate_threshold_warning" {
description = "Jobs Failed rate limit (warning threshold)"
default = 50
@ -368,6 +506,12 @@ variable "iothub_failed_listjobs_rate_message" {
default = ""
}
variable "iothub_failed_listjobs_rate_timeframe" {
description = "Monitor timeframe for IoT Hub failed list jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "iothub_failed_listjobs_rate_threshold_warning" {
description = "ListJobs Failed rate limit (warning threshold)"
default = 50
@ -390,6 +534,12 @@ variable "iothub_failed_queryjobs_rate_message" {
default = ""
}
variable "iothub_failed_queryjobs_rate_timeframe" {
description = "Monitor timeframe for IoT Hub failed query jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "iothub_failed_queryjobs_rate_threshold_warning" {
description = "QueryJobs Failed rate limit (warning threshold)"
default = 50
@ -412,6 +562,12 @@ variable "iothub_failed_c2d_methods_rate_message" {
default = ""
}
variable "iothub_failed_c2d_methods_rate_timeframe" {
description = "Monitor timeframe for IoT Hub failed c2d method [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "iothub_failed_c2d_methods_rate_threshold_warning" {
description = "C2D Methods Failed rate limit (warning threshold)"
default = 50
@ -434,6 +590,12 @@ variable "iothub_failed_c2d_twin_read_rate_message" {
default = ""
}
variable "iothub_failed_c2d_twin_read_rate_timeframe" {
description = "Monitor timeframe for IoT Hub failed c2d twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "iothub_failed_c2d_twin_read_rate_threshold_warning" {
description = "C2D Twin Read Failed rate limit (warning threshold)"
default = 50
@ -456,6 +618,12 @@ variable "iothub_failed_c2d_twin_update_rate_message" {
default = ""
}
variable "iothub_failed_c2d_twin_update_rate_timeframe" {
description = "Monitor timeframe for IoT Hub failed c2d twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "iothub_failed_c2d_twin_update_rate_threshold_warning" {
description = "C2D Twin Update Failed rate limit (warning threshold)"
default = 50
@ -478,6 +646,12 @@ variable "iothub_failed_d2c_twin_read_rate_message" {
default = ""
}
variable "iothub_failed_d2c_twin_read_rate_timeframe" {
description = "Monitor timeframe for IoT Hub failed d2c twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "iothub_failed_d2c_twin_read_rate_threshold_warning" {
description = "D2C Twin Read Failed rate limit (warning threshold)"
default = 50
@ -500,6 +674,12 @@ variable "iothub_failed_d2c_twin_update_rate_message" {
default = ""
}
variable "iothub_failed_d2c_twin_update_rate_timeframe" {
description = "Monitor timeframe for IoT Hub failed d2c twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "iothub_failed_d2c_twin_update_rate_threshold_warning" {
description = "D2C Twin Update Failed rate limit (warning threshold)"
default = 50
@ -522,6 +702,12 @@ variable "iothub_dropped_d2c_telemetry_egress_message" {
default = ""
}
variable "iothub_dropped_d2c_telemetry_egress_timeframe" {
description = "Monitor timeframe for IoT Hub dropped d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "iothub_dropped_d2c_telemetry_egress_rate_threshold_warning" {
description = "D2C Telemetry Dropped limit (warning threshold)"
default = 50
@ -544,6 +730,12 @@ variable "iothub_orphaned_d2c_telemetry_egress_message" {
default = ""
}
variable "iothub_orphaned_d2c_telemetry_egress_timeframe" {
description = "Monitor timeframe for IoT Hub orphaned d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "iothub_orphaned_d2c_telemetry_egress_rate_threshold_warning" {
description = "D2C Telemetry Orphaned limit (warning threshold)"
default = 50
@ -566,6 +758,12 @@ variable "iothub_invalid_d2c_telemetry_egress_message" {
default = ""
}
variable "iothub_invalid_d2c_telemetry_egress_timeframe" {
description = "Monitor timeframe for IoT Hub invalid d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "iothub_invalid_d2c_telemetry_egress_rate_threshold_warning" {
description = "D2C Telemetry Invalid limit (warning threshold)"
default = 50
@ -589,6 +787,18 @@ variable "redis_status_message" {
default = ""
}
variable "redis_status_time_aggregator" {
description = "Monitor aggregator for Redis status [available values: min, max or avg]"
type = "string"
default = "max"
}
variable "redis_status_timeframe" {
description = "Monitor timeframe for Redis status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "redis_evictedkeys_limit_silenced" {
description = "Groups to mute for Redis evicted keys monitor"
type = "map"
@ -601,6 +811,18 @@ variable "redis_evictedkeys_limit_message" {
default = ""
}
variable "redis_evictedkeys_limit_time_aggregator" {
description = "Monitor aggregator for Redis evicted keys [available values: min, max or avg]"
type = "string"
default = "avg"
}
variable "redis_evictedkeys_limit_timeframe" {
description = "Monitor timeframe for Redis evicted keys [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "redis_evictedkeys_limit_threshold_warning" {
description = "Evicted keys limit (warning threshold)"
default = 0
@ -623,6 +845,18 @@ variable "redis_percent_processor_time_message" {
default = ""
}
variable "redis_percent_processor_time_time_aggregator" {
description = "Monitor aggregator for Redis processor [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "redis_percent_processor_time_timeframe" {
description = "Monitor timeframe for Redis processor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "redis_percent_processor_time_threshold_critical" {
description = "Processor time percent (critical threshold)"
default = 80
@ -645,6 +879,18 @@ variable "redis_server_load_rate_message" {
default = ""
}
variable "redis_server_load_rate_time_aggregator" {
description = "Monitor aggregator for Redis server load [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "redis_server_load_rate_timeframe" {
description = "Monitor timeframe for Redis server load [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "redis_server_load_rate_threshold_critical" {
description = "Server CPU load rate (critical threshold)"
default = 90
@ -668,10 +914,10 @@ variable "servicebus_status_message" {
default = ""
}
variable "servicebus_status_aggregator" {
description = "Monitor aggregator for Service Bus status [available values: min, max, sum or avg]"
variable "servicebus_status_time_aggregator" {
description = "Monitor aggregator for Service Bus status [available values: min, max or avg]"
type = "string"
default = "min"
default = "max"
}
variable "servicebus_status_timeframe" {
@ -692,6 +938,18 @@ variable "sqldatabase_cpu_message" {
default = ""
}
variable "sqldatabase_cpu_time_aggregator" {
description = "Monitor aggregator for SQL CPU [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "sqldatabase_cpu_timeframe" {
description = "Monitor timeframe for SQL CPU [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_15m"
}
variable "sqldatabase_cpu_threshold_warning" {
description = "CPU usage in percent (warning threshold)"
default = "80"
@ -714,6 +972,18 @@ variable "sqldatabase_diskspace_message" {
default = ""
}
variable "sqldatabase_diskspace_time_aggregator" {
description = "Monitor aggregator for SQL disk space [available values: min, max or avg]"
type = "string"
default = "max"
}
variable "sqldatabase_diskspace_timeframe" {
description = "Monitor timeframe for SQL disk space [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_15m"
}
variable "sqldatabase_diskspace_threshold_warning" {
description = "Disk space used in percent (warning threshold)"
default = "80"
@ -736,6 +1006,18 @@ variable "sqldatabase_dtu_message" {
default = ""
}
variable "sqldatabase_dtu_time_aggregator" {
description = "Monitor aggregator for SQL DTU [available values: min, max or avg]"
type = "string"
default = "avg"
}
variable "sqldatabase_dtu_timeframe" {
description = "Monitor timeframe for SQL DTU [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_15m"
}
variable "sqldatabase_dtu_threshold_warning" {
description = "Amount of DTU used (warning threshold)"
default = "85"
@ -758,6 +1040,12 @@ variable "sqldatabase_deadlock_message" {
default = ""
}
variable "sqldatabase_deadlock_timeframe" {
description = "Monitor timeframe for SQL Deadlock [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "sqldatabase_deadlock_threshold_critical" {
description = "Amount of Deadlocks (critical threshold)"
default = "1"
@ -776,6 +1064,18 @@ variable "storage_availability_message" {
default = ""
}
variable "storage_availability_time_aggregator" {
description = "Monitor aggregator for Storage availability [available values: min, max or avg]"
type = "string"
default = "max"
}
variable "storage_availability_timeframe" {
description = "Monitor timeframe for Storage availability [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "storage_availability_threshold_critical" {
description = "Minimum acceptable percent of availability for a storage"
default = 50
@ -798,6 +1098,18 @@ variable "storage_successful_requests_message" {
default = ""
}
variable "storage_successful_requests_time_aggregator" {
description = "Monitor aggregator for Storage successful requests [available values: min, max or avg]"
type = "string"
default = "max"
}
variable "storage_successful_requests_timeframe" {
description = "Monitor timeframe for Storage successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "storage_successful_requests_threshold_critical" {
description = "Minimum acceptable percent of successful requests for a storage"
default = 10
@ -820,6 +1132,18 @@ variable "storage_latency_message" {
default = ""
}
variable "storage_latency_time_aggregator" {
description = "Monitor aggregator for Storage latency [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "storage_latency_timeframe" {
description = "Monitor timeframe for Storage latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "storage_latency_threshold_critical" {
description = "Maximum acceptable end to end latency (ms) for a storage"
default = 2000
@ -842,6 +1166,18 @@ variable "storage_timeout_error_requests_message" {
default = ""
}
variable "storage_timeout_error_requests_time_aggregator" {
description = "Monitor aggregator for Storage timeout [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "storage_timeout_error_requests_timeframe" {
description = "Monitor timeframe for Storage timeout [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "storage_timeout_error_requests_threshold_critical" {
description = "Maximum acceptable percent of timeout error requests for a storage"
default = 90
@ -864,6 +1200,18 @@ variable "storage_network_error_requests_message" {
default = ""
}
variable "storage_network_error_requests_time_aggregator" {
description = "Monitor aggregator for Storage network errors [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "storage_network_error_requests_timeframe" {
description = "Monitor timeframe for Storage network errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "storage_network_error_requests_threshold_critical" {
description = "Maximum acceptable percent of network error requests for a storage"
default = 90
@ -886,6 +1234,18 @@ variable "storage_throttling_error_requests_message" {
default = ""
}
variable "storage_throttling_error_requests_time_aggregator" {
description = "Monitor aggregator for Storage throttling errors [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "storage_throttling_error_requests_timeframe" {
description = "Monitor timeframe for Storage throttling errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "storage_throttling_error_requests_threshold_critical" {
description = "Maximum acceptable percent of throttling error requests for a storage"
default = 90
@ -908,6 +1268,18 @@ variable "storage_server_other_error_requests_message" {
default = ""
}
# Aggregator setting for the Storage "server other errors" monitor.
# The description names the server variant explicitly so it cannot be confused
# with the client-side counterpart.
variable "storage_server_other_error_requests_time_aggregator" {
  description = "Monitor aggregator for Storage server other errors [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
variable "storage_server_other_error_requests_timeframe" {
description = "Monitor timeframe for Storage server other errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "storage_server_other_error_requests_threshold_critical" {
description = "Maximum acceptable percent of server other error requests for a storage"
default = 90
@ -930,6 +1302,18 @@ variable "storage_client_other_error_requests_message" {
default = ""
}
# Aggregator setting for the Storage "client other errors" monitor.
# The description names the client variant explicitly so it cannot be confused
# with the server-side counterpart.
variable "storage_client_other_error_requests_time_aggregator" {
  description = "Monitor aggregator for Storage client other errors [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
# Evaluation window setting for the Storage "client other errors" monitor.
variable "storage_client_other_error_requests_timeframe" {
  description = "Monitor timeframe for Storage client other errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = "string"
  default     = "last_5m"
}
variable "storage_client_other_error_requests_threshold_critical" {
description = "Maximum acceptable percent of client other error requests for a storage"
default = 90
@ -952,6 +1336,18 @@ variable "storage_authorization_error_requests_message" {
default = ""
}
variable "storage_authorization_error_requests_time_aggregator" {
description = "Monitor aggregator for Storage authorization errors [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "storage_authorization_error_requests_timeframe" {
description = "Monitor timeframe for Storage authorization errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "storage_authorization_error_requests_threshold_critical" {
description = "Maximum acceptable percent of authorization error requests for a storage"
default = 90
@ -975,6 +1371,18 @@ variable "streamanalytics_status_message" {
default = ""
}
variable "streamanalytics_status_time_aggregator" {
description = "Monitor aggregator for Stream Analytics status [available values: min, max or avg]"
type = "string"
default = "max"
}
variable "streamanalytics_status_timeframe" {
description = "Monitor timeframe for Stream Analytics status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "streamanalytics_su_utilization_silenced" {
description = "Groups to mute for Stream Analytics utilization monitor"
type = "map"
@ -987,6 +1395,18 @@ variable "streamanalytics_su_utilization_message" {
default = ""
}
variable "streamanalytics_su_utilization_time_aggregator" {
description = "Monitor aggregator for Stream Analytics utilization [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "streamanalytics_su_utilization_timeframe" {
description = "Monitor timeframe for Stream Analytics utilization [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "streamanalytics_su_utilization_threshold_warning" {
description = "Streaming Unit utilization rate limit (warning threshold)"
default = 60
@ -1009,6 +1429,12 @@ variable "streamanalytics_failed_function_requests_message" {
default = ""
}
variable "streamanalytics_failed_function_requests_timeframe" {
description = "Monitor timeframe for Stream Analytics failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "streamanalytics_failed_function_requests_threshold_warning" {
description = "Failed Function Request rate limit (warning threshold)"
default = 0
@ -1031,6 +1457,18 @@ variable "streamanalytics_conversion_errors_message" {
default = ""
}
variable "streamanalytics_conversion_errors_time_aggregator" {
description = "Monitor aggregator for Stream Analytics conversion errors [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "streamanalytics_conversion_errors_timeframe" {
description = "Monitor timeframe for Stream Analytics conversion errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "streamanalytics_conversion_errors_threshold_warning" {
description = "Conversion errors limit (warning threshold)"
default = 0
@ -1053,6 +1491,18 @@ variable "streamanalytics_runtime_errors_message" {
default = ""
}
variable "streamanalytics_runtime_errors_time_aggregator" {
description = "Monitor aggregator for Stream Analytics runtime errors [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "streamanalytics_runtime_errors_timeframe" {
description = "Monitor timeframe for Stream Analytics runtime errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
}
variable "streamanalytics_runtime_errors_threshold_warning" {
description = "Runtime errors limit (warning threshold)"
default = 0

View File

@ -33,83 +33,86 @@ Creates a DataDog monitors with the following checks :
* D2C telemetry egress fallback count check
* D2C telemetry ingress no sent count check
Inputs
------
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
| dropped_d2c_telemetry_egress_message | Custom message for IoT Hub dropped d2c telemetry monitor | string | `` | no |
| dropped_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `90` | no |
| dropped_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `50` | no |
| dropped_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub dropped d2c telemetry monitor | map | `<map>` | no |
| dropped_d2c_telemetry_egress_timeframe | Monitor timeframe for IoT Hub dropped d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| environment | Architecture Environment | string | - | yes |
| failed_c2d_methods_rate_message | Custom message for IoT Hub failed c2d method monitor | string | `` | no |
| failed_c2d_methods_rate_silenced | Groups to mute for IoT Hub failed c2d methods monitor | map | `<map>` | no |
| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `90` | no |
| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `50` | no |
| failed_c2d_methods_rate_timeframe | Monitor timeframe for IoT Hub failed c2d method [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| failed_c2d_twin_read_rate_message | Custom message for IoT Hub failed c2d twin read monitor | string | `` | no |
| failed_c2d_twin_read_rate_silenced | Groups to mute for IoT Hub failed c2d twin read monitor | map | `<map>` | no |
| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `90` | no |
| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `50` | no |
| failed_c2d_twin_read_rate_timeframe | Monitor timeframe for IoT Hub failed c2d twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| failed_c2d_twin_update_rate_message | Custom message for IoT Hub failed c2d twin update monitor | string | `` | no |
| failed_c2d_twin_update_rate_silenced | Groups to mute for IoT Hub failed c2d twin update monitor | map | `<map>` | no |
| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `90` | no |
| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `50` | no |
| failed_c2d_twin_update_rate_timeframe | Monitor timeframe for IoT Hub failed c2d twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| failed_d2c_twin_read_rate_message | Custom message for IoT Hub failed d2c twin read monitor | string | `` | no |
| failed_d2c_twin_read_rate_silenced | Groups to mute for IoT Hub failed d2c twin read monitor | map | `<map>` | no |
| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `90` | no |
| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `50` | no |
| failed_d2c_twin_read_rate_timeframe | Monitor timeframe for IoT Hub failed d2c twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| failed_d2c_twin_update_rate_message | Custom message for IoT Hub failed d2c twin update monitor | string | `` | no |
| failed_d2c_twin_update_rate_silenced | Groups to mute for IoT Hub failed d2c twin update monitor | map | `<map>` | no |
| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `90` | no |
| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `50` | no |
| failed_d2c_twin_update_rate_timeframe | Monitor timeframe for IoT Hub failed d2c twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| failed_jobs_rate_message | Custom message for IoT Hub failed jobs monitor | string | `` | no |
| failed_jobs_rate_silenced | Groups to mute for IoT Hub failed jobs monitor | map | `<map>` | no |
| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `90` | no |
| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `50` | no |
| failed_jobs_rate_timeframe | Monitor timeframe for IoT Hub failed jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| failed_listjobs_rate_message | Custom message for IoT Hub failed list jobs monitor | string | `` | no |
| failed_listjobs_rate_silenced | Groups to mute for IoT Hub failed list jobs monitor | map | `<map>` | no |
| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `90` | no |
| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `50` | no |
| failed_listjobs_rate_timeframe | Monitor timeframe for IoT Hub failed list jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| failed_queryjobs_rate_message | Custom message for IoT Hub failed query jobs monitor | string | `` | no |
| failed_queryjobs_rate_silenced | Groups to mute for IoT Hub failed query jobs monitor | map | `<map>` | no |
| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `90` | no |
| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` | no |
| failed_queryjobs_rate_timeframe | Monitor timeframe for IoT Hub failed query jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| filter_tags | Tags used for filtering | string | `*` | no |
| invalid_d2c_telemetry_egress_message | Custom message for IoT Hub invalid d2c telemetry monitor | string | `` | no |
| invalid_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `90` | no |
| invalid_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `50` | no |
| invalid_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub invalid d2c telemetry monitor | map | `<map>` | no |
| invalid_d2c_telemetry_egress_timeframe | Monitor timeframe for IoT Hub invalid d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| message | Message sent when an alert is triggered | string | - | yes |
| orphaned_d2c_telemetry_egress_message | Custom message for IoT Hub orphaned d2c telemetry monitor | string | `` | no |
| orphaned_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `90` | no |
| orphaned_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `50` | no |
| orphaned_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub orphaned d2c telemetry monitor | map | `<map>` | no |
| orphaned_d2c_telemetry_egress_timeframe | Monitor timeframe for IoT Hub orphaned d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| status_message | Custom message for IoT Hub status monitor | string | `` | no |
| status_silenced | Groups to mute for IoT Hub status monitor | map | `<map>` | no |
| status_timeframe | Monitor timeframe for IoT Hub status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| too_many_d2c_telemetry_ingress_nosent_message | Custom message for IoT Hub unsent d2c telemetry monitor | string | `` | no |
| too_many_d2c_telemetry_ingress_nosent_silenced | Groups to mute for IoT Hub unsent d2c telemetry monitor | map | `<map>` | no |
| too_many_d2c_telemetry_ingress_nosent_timeframe | Monitor timeframe for IoT Hub unsent d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| total_devices_message | Custom message for IoT Hub total devices monitor | string | `` | no |
| total_devices_silenced | Groups to mute for IoT Hub total devices monitor | map | `<map>` | no |
| total_devices_timeframe | Monitor timeframe for IoT Hub total devices [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
Inputs
------
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
| dropped_d2c_telemetry_egress_message | Custom message for IoT Hub dropped d2c telemetry monitor | string | `` | no |
| dropped_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `90` | no |
| dropped_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `50` | no |
| dropped_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub dropped d2c telemetry monitor | map | `<map>` | no |
| dropped_d2c_telemetry_egress_timeframe | Monitor timeframe for IoT Hub dropped d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| environment | Architecture Environment | string | - | yes |
| failed_c2d_methods_rate_message | Custom message for IoT Hub failed c2d method monitor | string | `` | no |
| failed_c2d_methods_rate_silenced | Groups to mute for IoT Hub failed c2d methods monitor | map | `<map>` | no |
| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `90` | no |
| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `50` | no |
| failed_c2d_methods_rate_timeframe | Monitor timeframe for IoT Hub failed c2d method [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| failed_c2d_twin_read_rate_message | Custom message for IoT Hub failed c2d twin read monitor | string | `` | no |
| failed_c2d_twin_read_rate_silenced | Groups to mute for IoT Hub failed c2d twin read monitor | map | `<map>` | no |
| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `90` | no |
| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `50` | no |
| failed_c2d_twin_read_rate_timeframe | Monitor timeframe for IoT Hub failed c2d twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| failed_c2d_twin_update_rate_message | Custom message for IoT Hub failed c2d twin update monitor | string | `` | no |
| failed_c2d_twin_update_rate_silenced | Groups to mute for IoT Hub failed c2d twin update monitor | map | `<map>` | no |
| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `90` | no |
| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `50` | no |
| failed_c2d_twin_update_rate_timeframe | Monitor timeframe for IoT Hub failed c2d twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| failed_d2c_twin_read_rate_message | Custom message for IoT Hub failed d2c twin read monitor | string | `` | no |
| failed_d2c_twin_read_rate_silenced | Groups to mute for IoT Hub failed d2c twin read monitor | map | `<map>` | no |
| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `90` | no |
| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `50` | no |
| failed_d2c_twin_read_rate_timeframe | Monitor timeframe for IoT Hub failed d2c twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| failed_d2c_twin_update_rate_message | Custom message for IoT Hub failed d2c twin update monitor | string | `` | no |
| failed_d2c_twin_update_rate_silenced | Groups to mute for IoT Hub failed d2c twin update monitor | map | `<map>` | no |
| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `90` | no |
| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `50` | no |
| failed_d2c_twin_update_rate_timeframe | Monitor timeframe for IoT Hub failed d2c twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| failed_jobs_rate_message | Custom message for IoT Hub failed jobs monitor | string | `` | no |
| failed_jobs_rate_silenced | Groups to mute for IoT Hub failed jobs monitor | map | `<map>` | no |
| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `90` | no |
| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `50` | no |
| failed_jobs_rate_timeframe | Monitor timeframe for IoT Hub failed jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| failed_listjobs_rate_message | Custom message for IoT Hub failed list jobs monitor | string | `` | no |
| failed_listjobs_rate_silenced | Groups to mute for IoT Hub failed list jobs monitor | map | `<map>` | no |
| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `90` | no |
| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `50` | no |
| failed_listjobs_rate_timeframe | Monitor timeframe for IoT Hub failed list jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| failed_queryjobs_rate_message | Custom message for IoT Hub failed query jobs monitor | string | `` | no |
| failed_queryjobs_rate_silenced | Groups to mute for IoT Hub failed query jobs monitor | map | `<map>` | no |
| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `90` | no |
| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` | no |
| failed_queryjobs_rate_timeframe | Monitor timeframe for IoT Hub failed query jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| filter_tags | Tags used for filtering | string | `*` | no |
| invalid_d2c_telemetry_egress_message | Custom message for IoT Hub invalid d2c telemetry monitor | string | `` | no |
| invalid_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `90` | no |
| invalid_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `50` | no |
| invalid_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub invalid d2c telemetry monitor | map | `<map>` | no |
| invalid_d2c_telemetry_egress_timeframe | Monitor timeframe for IoT Hub invalid d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| message | Message sent when an alert is triggered | string | - | yes |
| orphaned_d2c_telemetry_egress_message | Custom message for IoT Hub orphaned d2c telemetry monitor | string | `` | no |
| orphaned_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `90` | no |
| orphaned_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `50` | no |
| orphaned_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub orphaned d2c telemetry monitor | map | `<map>` | no |
| orphaned_d2c_telemetry_egress_timeframe | Monitor timeframe for IoT Hub orphaned d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| status_message | Custom message for IoT Hub status monitor | string | `` | no |
| status_silenced | Groups to mute for IoT Hub status monitor | map | `<map>` | no |
| status_time_aggregator | Monitor aggregator for IoT Hub status [available values: min, max, sum or avg] | string | `max` | no |
| status_timeframe | Monitor timeframe for IoT Hub status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| too_many_d2c_telemetry_ingress_nosent_message | Custom message for IoT Hub unsent d2c telemetry monitor | string | `` | no |
| too_many_d2c_telemetry_ingress_nosent_silenced | Groups to mute for IoT Hub unsent d2c telemetry monitor | map | `<map>` | no |
| too_many_d2c_telemetry_ingress_nosent_timeframe | Monitor timeframe for IoT Hub unsent d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| total_devices_message | Custom message for IoT Hub total devices monitor | string | `` | no |
| total_devices_silenced | Groups to mute for IoT Hub total devices monitor | map | `<map>` | no |
| total_devices_time_aggregator | Monitor aggregator for IoT Hub total devices [available values: min, max, sum or avg] | string | `min` | no |
| total_devices_timeframe | Monitor timeframe for IoT Hub total devices [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
Related documentation
---------------------
DataDog documentation: [https://docs.datadoghq.com/integrations/azure_iot_hub](https://docs.datadoghq.com/integrations/azure_iot_hub)
Azure IOT Hubs metrics documentation: [https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health](https://docs.microsoft.com/en-us/azure/iot-hub/iot-hub-monitor-resource-health)

View File

@ -32,6 +32,12 @@ variable "status_message" {
default = ""
}
# Time aggregation function for the IoT Hub status monitor. It is interpolated
# as the leading `<aggregator>(<timeframe>):` part of the "status"
# datadog_monitor query, which alerts when the aggregated value is < 1.
variable "status_time_aggregator" {
description = "Monitor aggregator for IoT Hub status [available values: min, max, sum or avg]"
type = "string"
default = "max"
}
variable "status_timeframe" {
description = "Monitor timeframe for IoT Hub status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -50,6 +56,12 @@ variable "total_devices_message" {
default = ""
}
# Time aggregation function for the IoT Hub total devices monitor. It is
# interpolated as the leading `<aggregator>(<timeframe>):` part of the
# "total_devices" datadog_monitor query, which alerts when the aggregated
# value is == 0.
variable "total_devices_time_aggregator" {
description = "Monitor aggregator for IoT Hub total devices [available values: min, max, sum or avg]"
type = "string"
default = "min"
}
variable "total_devices_timeframe" {
description = "Monitor timeframe for IoT Hub total devices [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -105,7 +105,9 @@ resource "datadog_monitor" "status" {
message = "${coalesce(var.status_message, var.message)}"
query = <<EOF
avg(${var.status_timeframe}):avg:azure.devices_iothubs.status{${var.filter_tags}} by {resource_group,region,name} < 1
${var.status_time_aggregator}(${var.status_timeframe}): (
avg:azure.devices_iothubs.status{${var.filter_tags}} by {resource_group,region,name}
) < 1
EOF
type = "metric alert"
@ -130,7 +132,9 @@ resource "datadog_monitor" "total_devices" {
message = "${coalesce(var.total_devices_message, var.message)}"
query = <<EOF
avg(${var.total_devices_timeframe}):avg:azure.devices_iothubs.devices.total_devices{${var.filter_tags}} by {resource_group,region,name} == 0
${var.total_devices_time_aggregator}(${var.total_devices_timeframe}): (
avg:azure.devices_iothubs.devices.total_devices{${var.filter_tags}} by {resource_group,region,name}
) == 0
EOF
type = "metric alert"

View File

@ -10,20 +10,26 @@ module "apimanagement" {
status_silenced = "${var.apimanagement_status_silenced}"
status_message = "${var.apimanagement_status_message}"
status_time_aggregator = "${var.apimanagement_status_time_aggregator}"
status_timeframe = "${var.apimanagement_status_timeframe}"
failed_requests_silenced = "${var.apimanagement_failed_requests_silenced}"
failed_requests_message = "${var.apimanagement_failed_requests_message}"
failed_requests_timeframe = "${var.apimanagement_failed_requests_timeframe}"
failed_requests_threshold_critical = "${var.apimanagement_failed_requests_threshold_critical}"
failed_requests_threshold_warning = "${var.apimanagement_failed_requests_threshold_warning}"
other_requests_silenced = "${var.apimanagement_other_requests_silenced}"
other_requests_message = "${var.apimanagement_other_requests_message}"
other_requests_timeframe = "${var.apimanagement_other_requests_timeframe}"
other_requests_threshold_critical = "${var.apimanagement_other_requests_threshold_critical}"
other_requests_threshold_warning = "${var.apimanagement_other_requests_threshold_warning}"
successful_requests_silenced = "${var.apimanagement_successful_requests_silenced}"
successful_requests_message = "${var.apimanagement_successful_requests_message}"
successful_requests_timeframe = "${var.apimanagement_successful_requests_timeframe}"
successful_requests_threshold_critical = "${var.apimanagement_successful_requests_threshold_critical}"
successful_requests_threshold_warning = "${var.apimanagement_successful_requests_threshold_warning}"
unauthorized_requests_silenced = "${var.apimanagement_unauthorized_requests_silenced}"
unauthorized_requests_message = "${var.apimanagement_unauthorized_requests_message}"
unauthorized_requests_timeframe = "${var.apimanagement_unauthorized_requests_timeframe}"
unauthorized_requests_threshold_critical = "${var.apimanagement_unauthorized_requests_threshold_critical}"
unauthorized_requests_threshold_warning = "${var.apimanagement_unauthorized_requests_threshold_warning}"
}
@ -40,22 +46,29 @@ module "appservices" {
http_successful_requests_silenced = "${var.appservices_http_successful_requests_silenced}"
http_successful_requests_message = "${var.appservices_http_successful_requests_message}"
http_successful_requests_timeframe = "${var.appservices_http_successful_requests_timeframe}"
http_successful_requests_threshold_critical = "${var.appservices_http_successful_requests_threshold_critical}"
http_successful_requests_threshold_warning = "${var.appservices_http_successful_requests_threshold_warning}"
http_5xx_requests_silenced = "${var.appservices_http_5xx_requests_silenced}"
http_5xx_requests_message = "${var.appservices_http_5xx_requests_message}"
http_5xx_requests_timeframe = "${var.appservices_http_5xx_requests_timeframe}"
http_5xx_requests_threshold_critical = "${var.appservices_http_5xx_requests_threshold_critical}"
http_5xx_requests_threshold_warning = "${var.appservices_http_5xx_requests_threshold_warning}"
http_4xx_requests_silenced = "${var.appservices_http_4xx_requests_silenced}"
http_4xx_requests_message = "${var.appservices_http_4xx_requests_message}"
http_4xx_requests_timeframe = "${var.appservices_http_4xx_requests_timeframe}"
http_4xx_requests_threshold_critical = "${var.appservices_http_4xx_requests_threshold_critical}"
http_4xx_requests_threshold_warning = "${var.appservices_http_4xx_requests_threshold_warning}"
memory_usage_silenced = "${var.appservices_memory_usage_silenced}"
memory_usage_message = "${var.appservices_memory_usage_message}"
memory_usage_time_aggregator = "${var.appservices_memory_usage_time_aggregator}"
memory_usage_timeframe = "${var.appservices_memory_usage_timeframe}"
memory_usage_threshold_critical = "${var.appservices_memory_usage_threshold_critical}"
memory_usage_threshold_warning = "${var.appservices_memory_usage_threshold_warning}"
response_time_silenced = "${var.appservices_response_time_silenced}"
response_time_message = "${var.appservices_response_time_message}"
response_time_time_aggregator = "${var.appservices_response_time_time_aggregator}"
response_time_timeframe = "${var.appservices_response_time_timeframe}"
response_time_threshold_critical = "${var.appservices_response_time_threshold_critical}"
response_time_threshold_warning = "${var.appservices_response_time_threshold_warning}"
}
@ -72,12 +85,16 @@ module "eventhub" {
status_silenced = "${var.eventhub_status_silenced}"
status_message = "${var.eventhub_status_message}"
status_time_aggregator = "${var.eventhub_status_time_aggregator}"
status_timeframe = "${var.eventhub_status_timeframe}"
errors_rate_silenced = "${var.eventhub_errors_rate_silenced}"
errors_rate_message = "${var.eventhub_errors_rate_message}"
errors_rate_timeframe = "${var.eventhub_errors_rate_timeframe}"
errors_rate_thresold_critical = "${var.eventhub_errors_rate_thresold_critical}"
errors_rate_thresold_warning = "${var.eventhub_errors_rate_thresold_warning}"
failed_requests_rate_silenced = "${var.eventhub_failed_requests_rate_silenced}"
failed_requests_rate_message = "${var.eventhub_failed_requests_rate_message}"
failed_requests_rate_timeframe = "${var.eventhub_failed_requests_rate_timeframe}"
failed_requests_rate_thresold_critical = "${var.eventhub_failed_requests_rate_thresold_critical}"
failed_requests_rate_thresold_warning = "${var.eventhub_failed_requests_rate_thresold_warning}"
}
@ -93,51 +110,68 @@ module "iothub" {
status_silenced = "${var.iothub_status_silenced}"
status_message = "${var.iothub_status_message}"
status_time_aggregator = "${var.iothub_status_time_aggregator}"
status_timeframe = "${var.iothub_status_timeframe}"
total_devices_silenced = "${var.iothub_total_devices_silenced}"
total_devices_message = "${var.iothub_total_devices_message}"
total_devices_time_aggregator = "${var.iothub_total_devices_time_aggregator}"
total_devices_timeframe = "${var.iothub_total_devices_timeframe}"
too_many_d2c_telemetry_ingress_nosent_silenced = "${var.iothub_too_many_d2c_telemetry_ingress_nosent_silenced}"
too_many_d2c_telemetry_ingress_nosent_message = "${var.iothub_too_many_d2c_telemetry_ingress_nosent_message}"
too_many_d2c_telemetry_ingress_nosent_timeframe = "${var.iothub_too_many_d2c_telemetry_ingress_nosent_timeframe}"
dropped_d2c_telemetry_egress_silenced = "${var.iothub_dropped_d2c_telemetry_egress_silenced}"
dropped_d2c_telemetry_egress_message = "${var.iothub_dropped_d2c_telemetry_egress_message}"
dropped_d2c_telemetry_egress_timeframe = "${var.iothub_dropped_d2c_telemetry_egress_timeframe}"
dropped_d2c_telemetry_egress_rate_threshold_critical = "${var.iothub_dropped_d2c_telemetry_egress_rate_threshold_critical}"
dropped_d2c_telemetry_egress_rate_threshold_warning = "${var.iothub_dropped_d2c_telemetry_egress_rate_threshold_warning}"
failed_c2d_methods_rate_silenced = "${var.iothub_failed_c2d_methods_rate_silenced}"
failed_c2d_methods_rate_message = "${var.iothub_failed_c2d_methods_rate_message}"
failed_c2d_methods_rate_timeframe = "${var.iothub_failed_c2d_methods_rate_timeframe}"
failed_c2d_methods_rate_threshold_critical = "${var.iothub_failed_c2d_methods_rate_threshold_critical}"
failed_c2d_methods_rate_threshold_warning = "${var.iothub_failed_c2d_methods_rate_threshold_warning}"
failed_c2d_twin_read_rate_silenced = "${var.iothub_failed_c2d_twin_read_rate_silenced}"
failed_c2d_twin_read_rate_message = "${var.iothub_failed_c2d_twin_read_rate_message}"
failed_c2d_twin_read_rate_timeframe = "${var.iothub_failed_c2d_twin_read_rate_timeframe}"
failed_c2d_twin_read_rate_threshold_critical = "${var.iothub_failed_c2d_twin_read_rate_threshold_critical}"
failed_c2d_twin_read_rate_threshold_warning = "${var.iothub_failed_c2d_twin_read_rate_threshold_warning}"
failed_c2d_twin_update_rate_silenced = "${var.iothub_failed_c2d_twin_update_rate_silenced}"
failed_c2d_twin_update_rate_message = "${var.iothub_failed_c2d_twin_update_rate_message}"
failed_c2d_twin_update_rate_timeframe = "${var.iothub_failed_c2d_twin_update_rate_timeframe}"
failed_c2d_twin_update_rate_threshold_critical = "${var.iothub_failed_c2d_twin_update_rate_threshold_critical}"
failed_c2d_twin_update_rate_threshold_warning = "${var.iothub_failed_c2d_twin_update_rate_threshold_warning}"
failed_d2c_twin_read_rate_silenced = "${var.iothub_failed_d2c_twin_read_rate_silenced}"
failed_d2c_twin_read_rate_message = "${var.iothub_failed_d2c_twin_read_rate_message}"
failed_d2c_twin_read_rate_timeframe = "${var.iothub_failed_d2c_twin_read_rate_timeframe}"
failed_d2c_twin_read_rate_threshold_critical = "${var.iothub_failed_d2c_twin_read_rate_threshold_critical}"
failed_d2c_twin_read_rate_threshold_warning = "${var.iothub_failed_d2c_twin_read_rate_threshold_warning}"
failed_d2c_twin_update_rate_silenced = "${var.iothub_failed_d2c_twin_update_rate_silenced}"
failed_d2c_twin_update_rate_message = "${var.iothub_failed_d2c_twin_update_rate_message}"
failed_d2c_twin_update_rate_timeframe = "${var.iothub_failed_d2c_twin_update_rate_timeframe}"
failed_d2c_twin_update_rate_threshold_critical = "${var.iothub_failed_d2c_twin_update_rate_threshold_critical}"
failed_d2c_twin_update_rate_threshold_warning = "${var.iothub_failed_d2c_twin_update_rate_threshold_warning}"
failed_jobs_rate_silenced = "${var.iothub_failed_jobs_rate_silenced}"
failed_jobs_rate_message = "${var.iothub_failed_jobs_rate_message}"
failed_jobs_rate_timeframe = "${var.iothub_failed_jobs_rate_timeframe}"
failed_jobs_rate_threshold_critical = "${var.iothub_failed_jobs_rate_threshold_critical}"
failed_jobs_rate_threshold_warning = "${var.iothub_failed_jobs_rate_threshold_warning}"
failed_listjobs_rate_silenced = "${var.iothub_failed_listjobs_rate_silenced}"
failed_listjobs_rate_message = "${var.iothub_failed_listjobs_rate_message}"
failed_listjobs_rate_timeframe = "${var.iothub_failed_listjobs_rate_timeframe}"
failed_listjobs_rate_threshold_critical = "${var.iothub_failed_listjobs_rate_threshold_critical}"
failed_listjobs_rate_threshold_warning = "${var.iothub_failed_listjobs_rate_threshold_warning}"
failed_queryjobs_rate_silenced = "${var.iothub_failed_queryjobs_rate_silenced}"
failed_queryjobs_rate_message = "${var.iothub_failed_queryjobs_rate_message}"
failed_queryjobs_rate_timeframe = "${var.iothub_failed_queryjobs_rate_timeframe}"
failed_queryjobs_rate_threshold_critical = "${var.iothub_failed_queryjobs_rate_threshold_critical}"
failed_queryjobs_rate_threshold_warning = "${var.iothub_failed_queryjobs_rate_threshold_warning}"
invalid_d2c_telemetry_egress_silenced = "${var.iothub_invalid_d2c_telemetry_egress_silenced}"
invalid_d2c_telemetry_egress_message = "${var.iothub_invalid_d2c_telemetry_egress_message}"
invalid_d2c_telemetry_egress_timeframe = "${var.iothub_invalid_d2c_telemetry_egress_timeframe}"
invalid_d2c_telemetry_egress_rate_threshold_critical = "${var.iothub_invalid_d2c_telemetry_egress_rate_threshold_critical}"
invalid_d2c_telemetry_egress_rate_threshold_warning = "${var.iothub_invalid_d2c_telemetry_egress_rate_threshold_warning}"
orphaned_d2c_telemetry_egress_silenced = "${var.iothub_orphaned_d2c_telemetry_egress_silenced}"
orphaned_d2c_telemetry_egress_message = "${var.iothub_orphaned_d2c_telemetry_egress_message}"
orphaned_d2c_telemetry_egress_timeframe = "${var.iothub_orphaned_d2c_telemetry_egress_timeframe}"
orphaned_d2c_telemetry_egress_rate_threshold_critical = "${var.iothub_orphaned_d2c_telemetry_egress_rate_threshold_critical}"
orphaned_d2c_telemetry_egress_rate_threshold_warning = "${var.iothub_orphaned_d2c_telemetry_egress_rate_threshold_warning}"
}
@ -154,16 +188,24 @@ module "redis" {
status_silenced = "${var.redis_status_silenced}"
status_message = "${var.redis_status_message}"
status_time_aggregator = "${var.redis_status_time_aggregator}"
status_timeframe = "${var.redis_status_timeframe}"
evictedkeys_limit_silenced = "${var.redis_evictedkeys_limit_silenced}"
evictedkeys_limit_message = "${var.redis_evictedkeys_limit_message}"
evictedkeys_limit_time_aggregator = "${var.redis_evictedkeys_limit_time_aggregator}"
evictedkeys_limit_timeframe = "${var.redis_evictedkeys_limit_timeframe}"
evictedkeys_limit_threshold_critical = "${var.redis_evictedkeys_limit_threshold_critical}"
evictedkeys_limit_threshold_warning = "${var.redis_evictedkeys_limit_threshold_warning}"
percent_processor_time_silenced = "${var.redis_percent_processor_time_silenced}"
percent_processor_time_message = "${var.redis_percent_processor_time_message}"
percent_processor_time_time_aggregator = "${var.redis_percent_processor_time_time_aggregator}"
percent_processor_time_timeframe = "${var.redis_percent_processor_time_timeframe}"
percent_processor_time_threshold_critical = "${var.redis_percent_processor_time_threshold_critical}"
percent_processor_time_threshold_warning = "${var.redis_percent_processor_time_threshold_warning}"
server_load_rate_silenced = "${var.redis_server_load_rate_silenced}"
server_load_rate_message = "${var.redis_server_load_rate_message}"
server_load_rate_time_aggregator = "${var.redis_server_load_rate_time_aggregator}"
server_load_rate_timeframe = "${var.redis_server_load_rate_timeframe}"
server_load_rate_threshold_critical = "${var.redis_server_load_rate_threshold_critical}"
server_load_rate_threshold_warning = "${var.redis_server_load_rate_threshold_warning}"
}
@ -178,10 +220,10 @@ module "servicebus" {
filter_tags_use_defaults = "${var.filter_tags_use_defaults}"
filter_tags_custom = "${var.filter_tags_custom}"
status_silenced = "${var.servicebus_status_silenced}"
status_message = "${var.servicebus_status_message}"
status_timeframe = "${var.servicebus_status_timeframe}"
status_timeframe_aggregator = "${var.servicebus_status_aggregator}"
status_silenced = "${var.servicebus_status_silenced}"
status_message = "${var.servicebus_status_message}"
status_timeframe = "${var.servicebus_status_timeframe}"
status_time_aggregator = "${var.servicebus_status_time_aggregator}"
}
module "sqldatabase" {
@ -196,17 +238,24 @@ module "sqldatabase" {
cpu_silenced = "${var.sqldatabase_cpu_silenced}"
cpu_message = "${var.sqldatabase_cpu_message}"
cpu_time_aggregator = "${var.sqldatabase_cpu_time_aggregator}"
cpu_timeframe = "${var.sqldatabase_cpu_timeframe}"
cpu_threshold_critical = "${var.sqldatabase_cpu_threshold_critical}"
cpu_threshold_warning = "${var.sqldatabase_cpu_threshold_warning}"
deadlock_silenced = "${var.sqldatabase_deadlock_silenced}"
deadlock_message = "${var.sqldatabase_deadlock_message}"
deadlock_timeframe = "${var.sqldatabase_deadlock_timeframe}"
deadlock_threshold_critical = "${var.sqldatabase_deadlock_threshold_critical}"
diskspace_silenced = "${var.sqldatabase_diskspace_silenced}"
diskspace_message = "${var.sqldatabase_diskspace_message}"
diskspace_time_aggregator = "${var.sqldatabase_diskspace_time_aggregator}"
diskspace_timeframe = "${var.sqldatabase_diskspace_timeframe}"
diskspace_threshold_critical = "${var.sqldatabase_diskspace_threshold_critical}"
diskspace_threshold_warning = "${var.sqldatabase_diskspace_threshold_warning}"
dtu_silenced = "${var.sqldatabase_dtu_silenced}"
dtu_message = "${var.sqldatabase_dtu_message}"
dtu_time_aggregator = "${var.sqldatabase_dtu_time_aggregator}"
dtu_timeframe = "${var.sqldatabase_dtu_timeframe}"
dtu_threshold_critical = "${var.sqldatabase_dtu_threshold_critical}"
dtu_threshold_warning = "${var.sqldatabase_dtu_threshold_warning}"
}
@ -223,38 +272,56 @@ module "storage" {
authorization_error_requests_silenced = "${var.storage_authorization_error_requests_silenced}"
authorization_error_requests_message = "${var.storage_authorization_error_requests_message}"
authorization_error_requests_time_aggregator = "${var.storage_authorization_error_requests_time_aggregator}"
authorization_error_requests_timeframe = "${var.storage_authorization_error_requests_timeframe}"
authorization_error_requests_threshold_critical = "${var.storage_authorization_error_requests_threshold_critical}"
authorization_error_requests_threshold_warning = "${var.storage_authorization_error_requests_threshold_warning}"
availability_silenced = "${var.storage_availability_silenced}"
availability_message = "${var.storage_availability_message}"
availability_time_aggregator = "${var.storage_availability_time_aggregator}"
availability_timeframe = "${var.storage_availability_timeframe}"
availability_threshold_critical = "${var.storage_availability_threshold_critical}"
availability_threshold_warning = "${var.storage_availability_threshold_warning}"
client_other_error_requests_silenced = "${var.storage_client_other_error_requests_silenced}"
client_other_error_requests_message = "${var.storage_client_other_error_requests_message}"
client_other_error_requests_time_aggregator = "${var.storage_client_other_error_requests_time_aggregator}"
client_other_error_requests_timeframe = "${var.storage_client_other_error_requests_timeframe}"
client_other_error_requests_threshold_critical = "${var.storage_client_other_error_requests_threshold_critical}"
client_other_error_requests_threshold_warning = "${var.storage_client_other_error_requests_threshold_warning}"
latency_silenced = "${var.storage_latency_silenced}"
latency_message = "${var.storage_latency_message}"
latency_time_aggregator = "${var.storage_latency_time_aggregator}"
latency_timeframe = "${var.storage_latency_timeframe}"
latency_threshold_critical = "${var.storage_latency_threshold_critical}"
latency_threshold_warning = "${var.storage_latency_threshold_warning}"
network_error_requests_silenced = "${var.storage_network_error_requests_silenced}"
network_error_requests_message = "${var.storage_network_error_requests_message}"
network_error_requests_time_aggregator = "${var.storage_network_error_requests_time_aggregator}"
network_error_requests_timeframe = "${var.storage_network_error_requests_timeframe}"
network_error_requests_threshold_critical = "${var.storage_network_error_requests_threshold_critical}"
network_error_requests_threshold_warning = "${var.storage_network_error_requests_threshold_warning}"
server_other_error_requests_silenced = "${var.storage_server_other_error_requests_silenced}"
server_other_error_requests_message = "${var.storage_server_other_error_requests_message}"
server_other_error_requests_time_aggregator = "${var.storage_server_other_error_requests_time_aggregator}"
server_other_error_requests_timeframe = "${var.storage_server_other_error_requests_timeframe}"
server_other_error_requests_threshold_critical = "${var.storage_server_other_error_requests_threshold_critical}"
server_other_error_requests_threshold_warning = "${var.storage_server_other_error_requests_threshold_warning}"
successful_requests_silenced = "${var.storage_successful_requests_silenced}"
successful_requests_message = "${var.storage_successful_requests_message}"
successful_requests_time_aggregator = "${var.storage_successful_requests_time_aggregator}"
successful_requests_timeframe = "${var.storage_successful_requests_timeframe}"
successful_requests_threshold_critical = "${var.storage_successful_requests_threshold_critical}"
successful_requests_threshold_warning = "${var.storage_successful_requests_threshold_warning}"
throttling_error_requests_silenced = "${var.storage_throttling_error_requests_silenced}"
throttling_error_requests_message = "${var.storage_throttling_error_requests_message}"
throttling_error_requests_time_aggregator = "${var.storage_throttling_error_requests_time_aggregator}"
throttling_error_requests_timeframe = "${var.storage_throttling_error_requests_timeframe}"
throttling_error_requests_threshold_critical = "${var.storage_throttling_error_requests_threshold_critical}"
throttling_error_requests_threshold_warning = "${var.storage_throttling_error_requests_threshold_warning}"
timeout_error_requests_silenced = "${var.storage_timeout_error_requests_silenced}"
timeout_error_requests_message = "${var.storage_timeout_error_requests_message}"
timeout_error_requests_time_aggregator = "${var.storage_timeout_error_requests_time_aggregator}"
timeout_error_requests_timeframe = "${var.storage_timeout_error_requests_timeframe}"
timeout_error_requests_threshold_critical = "${var.storage_timeout_error_requests_threshold_critical}"
timeout_error_requests_threshold_warning = "${var.storage_timeout_error_requests_threshold_warning}"
}
@ -271,18 +338,25 @@ module "streamanalytics" {
conversion_errors_silenced = "${var.streamanalytics_conversion_errors_silenced}"
conversion_errors_message = "${var.streamanalytics_conversion_errors_message}"
conversion_errors_time_aggregator = "${var.streamanalytics_conversion_errors_time_aggregator}"
conversion_errors_timeframe = "${var.streamanalytics_conversion_errors_timeframe}"
conversion_errors_threshold_critical = "${var.streamanalytics_conversion_errors_threshold_critical}"
conversion_errors_threshold_warning = "${var.streamanalytics_conversion_errors_threshold_warning}"
failed_function_requests_silenced = "${var.streamanalytics_failed_function_requests_silenced}"
failed_function_requests_message = "${var.streamanalytics_failed_function_requests_message}"
failed_function_requests_timeframe = "${var.streamanalytics_failed_function_requests_timeframe}"
failed_function_requests_threshold_critical = "${var.streamanalytics_failed_function_requests_threshold_critical}"
failed_function_requests_threshold_warning = "${var.streamanalytics_failed_function_requests_threshold_warning}"
runtime_errors_silenced = "${var.streamanalytics_runtime_errors_silenced}"
runtime_errors_message = "${var.streamanalytics_runtime_errors_message}"
runtime_errors_time_aggregator = "${var.streamanalytics_runtime_errors_time_aggregator}"
runtime_errors_timeframe = "${var.streamanalytics_runtime_errors_timeframe}"
runtime_errors_threshold_critical = "${var.streamanalytics_runtime_errors_threshold_critical}"
runtime_errors_threshold_warning = "${var.streamanalytics_runtime_errors_threshold_warning}"
su_utilization_silenced = "${var.streamanalytics_su_utilization_silenced}"
su_utilization_message = "${var.streamanalytics_su_utilization_message}"
su_utilization_time_aggregator = "${var.streamanalytics_su_utilization_time_aggregator}"
su_utilization_timeframe = "${var.streamanalytics_su_utilization_timeframe}"
su_utilization_threshold_critical = "${var.streamanalytics_su_utilization_threshold_critical}"
su_utilization_threshold_warning = "${var.streamanalytics_su_utilization_threshold_warning}"
}

Binary file not shown.

View File

@ -37,6 +37,12 @@ variable "status_message" {
default = ""
}
# Time-aggregation function for the Redis status monitor. It is interpolated
# at the head of the monitor query, in the form
# "<status_time_aggregator>(<status_timeframe>): ( ... ) != 1".
variable "status_time_aggregator" {
description = "Monitor aggregator for Redis status [available values: min, max or avg]"
type = "string"
default = "max"
}
variable "status_timeframe" {
description = "Monitor timeframe for Redis status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -55,6 +61,12 @@ variable "evictedkeys_limit_message" {
default = ""
}
# Time-aggregation function for the Redis evicted keys monitor. It is
# interpolated at the head of the monitor query, in the form
# "<aggregator>(<evictedkeys_limit_timeframe>): ( ... ) > <critical threshold>".
variable "evictedkeys_limit_time_aggregator" {
description = "Monitor aggregator for Redis evicted keys [available values: min, max or avg]"
type = "string"
default = "avg"
}
variable "evictedkeys_limit_timeframe" {
description = "Monitor timeframe for Redis evicted keys [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -83,6 +95,12 @@ variable "percent_processor_time_message" {
default = ""
}
# Time-aggregation function for the Redis processor time monitor. It is
# interpolated at the head of the monitor query, in the form
# "<aggregator>(<percent_processor_time_timeframe>): ( ... ) > <critical threshold>".
variable "percent_processor_time_time_aggregator" {
description = "Monitor aggregator for Redis processor [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "percent_processor_time_timeframe" {
description = "Monitor timeframe for Redis processor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -111,6 +129,12 @@ variable "server_load_rate_message" {
default = ""
}
# Time-aggregation function for the Redis server load monitor. It is
# interpolated at the head of the monitor query, in the form
# "<aggregator>(<server_load_rate_timeframe>): ( ... ) > <critical threshold>".
variable "server_load_rate_time_aggregator" {
description = "Monitor aggregator for Redis server load [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "server_load_rate_timeframe" {
description = "Monitor timeframe for Redis server load [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -11,8 +11,10 @@ resource "datadog_monitor" "status" {
message = "${coalesce(var.status_message, var.message)}"
query = <<EOF
avg(${var.status_timeframe}):avg:azure.cache_redis.status{${data.template_file.filter.rendered}} by {resource_group,region,name} != 1
EOF
${var.status_time_aggregator}(${var.status_timeframe}): (
avg:azure.cache_redis.status{${data.template_file.filter.rendered}} by {resource_group,region,name}
) != 1
EOF
type = "metric alert"
@ -36,7 +38,7 @@ resource "datadog_monitor" "evictedkeys" {
message = "${coalesce(var.evictedkeys_limit_message, var.message)}"
query = <<EOF
avg(${var.evictedkeys_limit_timeframe}): (
${var.evictedkeys_limit_time_aggregator}(${var.evictedkeys_limit_timeframe}): (
avg:azure.cache_redis.evictedkeys{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.evictedkeys_limit_threshold_critical}
EOF
@ -68,7 +70,7 @@ resource "datadog_monitor" "percent_processor_time" {
message = "${coalesce(var.percent_processor_time_message, var.message)}"
query = <<EOF
avg(${var.percent_processor_time_timeframe}): (
${var.percent_processor_time_time_aggregator}(${var.percent_processor_time_timeframe}): (
avg:azure.cache_redis.percent_processor_time{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.percent_processor_time_threshold_critical}
EOF
@ -100,7 +102,7 @@ resource "datadog_monitor" "server_load" {
message = "${coalesce(var.server_load_rate_message, var.message)}"
query = <<EOF
avg(${var.server_load_rate_timeframe}): (
${var.server_load_rate_time_aggregator}(${var.server_load_rate_timeframe}): (
avg:azure.cache_redis.server_load{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.server_load_rate_threshold_critical}
EOF

Binary file not shown.

View File

@ -37,10 +37,10 @@ variable "status_message" {
default = ""
}
variable "status_aggregator" {
description = "Monitor aggregator for Service Bus status [available values: min, max, sum or avg]"
variable "status_time_aggregator" {
description = "Monitor aggregator for Service Bus status [available values: min, max or avg]"
type = "string"
default = "min"
default = "max"
}
variable "status_timeframe" {

View File

@ -11,8 +11,8 @@ resource "datadog_monitor" "servicebus_status" {
message = "${coalesce(var.status_message, var.message)}"
query = <<EOF
${var.status_aggregator}(${var.status_timeframe}): (
${var.status_aggregator}:azure.servicebus_namespaces.status{${data.template_file.filter.rendered}} by {resource_group,region,name}
${var.status_time_aggregator}(${var.status_timeframe}): (
avg:azure.servicebus_namespaces.status{${data.template_file.filter.rendered}} by {resource_group,region,name}
) != 1
EOF

Binary file not shown.

View File

@ -37,6 +37,12 @@ variable "cpu_message" {
default = ""
}
# Time-aggregation function for the SQL database CPU monitor. It is
# interpolated at the head of the monitor query, in the form
# "<cpu_time_aggregator>(<cpu_timeframe>): ( ... ) > <cpu_threshold_critical>".
variable "cpu_time_aggregator" {
description = "Monitor aggregator for SQL CPU [available values: min, max or avg]"
type = "string"
default = "min"
}
variable "cpu_timeframe" {
description = "Monitor timeframe for SQL CPU [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -65,6 +71,12 @@ variable "diskspace_message" {
default = ""
}
# Time-aggregation function for the SQL database disk space monitor. It is
# interpolated at the head of the monitor query, in the form
# "<diskspace_time_aggregator>(<diskspace_timeframe>): ( ... ) > <diskspace_threshold_critical>".
variable "diskspace_time_aggregator" {
description = "Monitor aggregator for SQL disk space [available values: min, max or avg]"
type = "string"
default = "max"
}
variable "diskspace_timeframe" {
description = "Monitor timeframe for SQL disk space [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -93,6 +105,12 @@ variable "dtu_message" {
default = ""
}
# Time-aggregation function for the SQL database DTU consumption monitor. It is
# interpolated at the head of the monitor query, in the form
# "<dtu_time_aggregator>(<dtu_timeframe>): ( ... ) > <dtu_threshold_critical>".
variable "dtu_time_aggregator" {
description = "Monitor aggregator for SQL DTU [available values: min, max or avg]"
type = "string"
default = "avg"
}
variable "dtu_timeframe" {
description = "Monitor timeframe for SQL DTU [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -11,7 +11,7 @@ resource "datadog_monitor" "sql-database_cpu_90_15min" {
message = "${coalesce(var.cpu_message, var.message)}"
query = <<EOF
avg(${var.cpu_timeframe}): (
${var.cpu_time_aggregator}(${var.cpu_timeframe}): (
avg:azure.sql_servers_databases.cpu_percent{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.cpu_threshold_critical}
EOF
@ -44,7 +44,7 @@ resource "datadog_monitor" "sql-database_free_space_low" {
type = "metric alert"
query = <<EOF
avg(${var.diskspace_timeframe}): (
${var.diskspace_time_aggregator}(${var.diskspace_timeframe}): (
avg:azure.sql_servers_databases.storage_percent{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.diskspace_threshold_critical}
EOF
@ -76,7 +76,7 @@ resource "datadog_monitor" "sql-database_dtu_consumption_high" {
type = "metric alert"
query = <<EOF
avg(${var.dtu_timeframe}): (
${var.dtu_time_aggregator}(${var.dtu_timeframe}): (
azure.sql_servers_databases.dtu_consumption_percent{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.dtu_threshold_critical}
EOF

View File

@ -36,16 +36,19 @@ Inputs
| authorization_error_requests_silenced | Groups to mute for Storage authorization errors monitor | map | `<map>` | no |
| authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `90` | no |
| authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `50` | no |
| authorization_error_requests_time_aggregator | Monitor aggregator for Storage authorization errors [available values: min, max or avg] | string | `min` | no |
| authorization_error_requests_timeframe | Monitor timeframe for Storage authorization errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| availability_message | Custom message for Storage availability monitor | string | `` | no |
| availability_silenced | Groups to mute for Storage availability monitor | map | `<map>` | no |
| availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `50` | no |
| availability_threshold_warning | Warning regarding acceptable percent of availability for a storage | string | `90` | no |
| availability_time_aggregator | Monitor aggregator for Storage availability [available values: min, max or avg] | string | `max` | no |
| availability_timeframe | Monitor timeframe for Storage availability [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| client_other_error_requests_message | Custom message for Storage other errors monitor | string | `` | no |
| client_other_error_requests_silenced | Groups to mute for Storage other errors monitor | map | `<map>` | no |
| client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `90` | no |
| client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `50` | no |
| client_other_error_requests_time_aggregator | Monitor aggregator for Storage other errors [available values: min, max or avg] | string | `min` | no |
| client_other_error_requests_timeframe | Monitor timeframe for Storage other errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
| environment | Architecture environment | string | - | yes |
@ -55,32 +58,38 @@ Inputs
| latency_silenced | Groups to mute for Storage latency monitor | map | `<map>` | no |
| latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `2000` | no |
| latency_threshold_warning | Warning regarding acceptable end to end latency (ms) for a storage | string | `1000` | no |
| latency_time_aggregator | Monitor aggregator for Storage latency [available values: min, max or avg] | string | `min` | no |
| latency_timeframe | Monitor timeframe for Storage latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| message | Message sent when a Storage monitor is triggered | string | - | yes |
| network_error_requests_message | Custom message for Storage network errors monitor | string | `` | no |
| network_error_requests_silenced | Groups to mute for Storage network errors monitor | map | `<map>` | no |
| network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `90` | no |
| network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `50` | no |
| network_error_requests_time_aggregator | Monitor aggregator for Storage network errors [available values: min, max or avg] | string | `min` | no |
| network_error_requests_timeframe | Monitor timeframe for Storage network errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| server_other_error_requests_message | Custom message for Storage server other errors monitor | string | `` | no |
| server_other_error_requests_silenced | Groups to mute for Storage server other errors monitor | map | `<map>` | no |
| server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `90` | no |
| server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `50` | no |
| server_other_error_requests_time_aggregator | Monitor aggregator for Storage server other errors [available values: min, max or avg] | string | `min` | no |
| server_other_error_requests_timeframe | Monitor timeframe for Storage server other errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| successful_requests_message | Custom message for Storage successful requests monitor | string | `` | no |
| successful_requests_silenced | Groups to mute for Storage successful requests monitor | map | `<map>` | no |
| successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `10` | no |
| successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `30` | no |
| successful_requests_time_aggregator | Monitor aggregator for Storage successful requests [available values: min, max or avg] | string | `max` | no |
| successful_requests_timeframe | Monitor timeframe for Storage successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| throttling_error_requests_message | Custom message for Storage throttling error monitor | string | `` | no |
| throttling_error_requests_silenced | Groups to mute for Storage throttling error monitor | map | `<map>` | no |
| throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `90` | no |
| throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `50` | no |
| throttling_error_requests_time_aggregator | Monitor aggregator for Storage throttling errors [available values: min, max or avg] | string | `min` | no |
| throttling_error_requests_timeframe | Monitor timeframe for Storage throttling errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| timeout_error_requests_message | Custom message for Storage timeout monitor | string | `` | no |
| timeout_error_requests_silenced | Groups to mute for Storage timeout monitor | map | `<map>` | no |
| timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `90` | no |
| timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `50` | no |
| timeout_error_requests_time_aggregator | Monitor aggregator for Storage timeout [available values: min, max or avg] | string | `min` | no |
| timeout_error_requests_timeframe | Monitor timeframe for Storage timeout [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
Related documentation
@ -93,3 +102,4 @@ DataDog blog: [https://www.datadoghq.com/blog/monitor-azure-storage-datadog/](ht
Azure Storage metrics documentation: [https://docs.microsoft.com/en-us/azure/storage/common/storage-monitor-storage-account](https://docs.microsoft.com/en-us/azure/storage/common/storage-monitor-storage-account)
Azure Storage metrics detailed documentation: [https://docs.microsoft.com/en-us/rest/api/storageservices/storage-analytics-metrics-table-schema](https://docs.microsoft.com/en-us/rest/api/storageservices/storage-analytics-metrics-table-schema)

View File

@ -37,6 +37,12 @@ variable "availability_message" {
default = ""
}
# Time aggregation function of the Storage availability monitor. It is
# interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.availability_time_aggregator}(${var.availability_timeframe}): ...
variable "availability_time_aggregator" {
  description = "Monitor aggregator for Storage availability [available values: min, max or avg]"
  type        = "string"
  default     = "max"
}
variable "availability_timeframe" {
description = "Monitor timeframe for Storage availability [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -65,6 +71,12 @@ variable "successful_requests_message" {
default = ""
}
# Time aggregation function of the Storage successful requests monitor. It is
# interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.successful_requests_time_aggregator}(${var.successful_requests_timeframe}): ...
variable "successful_requests_time_aggregator" {
  description = "Monitor aggregator for Storage successful requests [available values: min, max or avg]"
  type        = "string"
  default     = "max"
}
variable "successful_requests_timeframe" {
description = "Monitor timeframe for Storage sucessful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -93,6 +105,12 @@ variable "latency_message" {
default = ""
}
# Time aggregation function of the Storage latency monitor. It is
# interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.latency_time_aggregator}(${var.latency_timeframe}): ...
variable "latency_time_aggregator" {
  description = "Monitor aggregator for Storage latency [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
variable "latency_timeframe" {
description = "Monitor timeframe for Storage latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -121,6 +139,12 @@ variable "timeout_error_requests_message" {
default = ""
}
# Time aggregation function of the Storage timeout error monitor. It is
# interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.timeout_error_requests_time_aggregator}(${var.timeout_error_requests_timeframe}): ...
variable "timeout_error_requests_time_aggregator" {
  description = "Monitor aggregator for Storage timeout [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
variable "timeout_error_requests_timeframe" {
description = "Monitor timeframe for Storage timeout [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -149,6 +173,12 @@ variable "network_error_requests_message" {
default = ""
}
# Time aggregation function of the Storage network error monitor. It is
# interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.network_error_requests_time_aggregator}(${var.network_error_requests_timeframe}): ...
variable "network_error_requests_time_aggregator" {
  description = "Monitor aggregator for Storage network errors [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
variable "network_error_requests_timeframe" {
description = "Monitor timeframe for Storage network errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -177,6 +207,12 @@ variable "throttling_error_requests_message" {
default = ""
}
# Time aggregation function of the Storage throttling error monitor. It is
# interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.throttling_error_requests_time_aggregator}(${var.throttling_error_requests_timeframe}): ...
variable "throttling_error_requests_time_aggregator" {
  description = "Monitor aggregator for Storage throttling errors [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
variable "throttling_error_requests_timeframe" {
description = "Monitor timeframe for Storage throttling errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -205,6 +241,12 @@ variable "server_other_error_requests_message" {
default = ""
}
# Time aggregation function of the Storage server other error monitor. It is
# interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.server_other_error_requests_time_aggregator}(${var.server_other_error_requests_timeframe}): ...
# The description names "server other errors" so that it matches the sibling
# server_other_error_requests_timeframe variable and is distinguishable from
# the client_other_error_requests_* inputs.
variable "server_other_error_requests_time_aggregator" {
  description = "Monitor aggregator for Storage server other errors [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
variable "server_other_error_requests_timeframe" {
description = "Monitor timeframe for Storage server other errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -233,6 +275,12 @@ variable "client_other_error_requests_message" {
default = ""
}
# Time aggregation function of the Storage client other error monitor. It is
# interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.client_other_error_requests_time_aggregator}(${var.client_other_error_requests_timeframe}): ...
variable "client_other_error_requests_time_aggregator" {
  description = "Monitor aggregator for Storage other errors [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
variable "client_other_error_requests_timeframe" {
description = "Monitor timeframe for Storage other errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -261,6 +309,12 @@ variable "authorization_error_requests_message" {
default = ""
}
# Time aggregation function of the Storage authorization error monitor. It is
# interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.authorization_error_requests_time_aggregator}(${var.authorization_error_requests_timeframe}): ...
variable "authorization_error_requests_time_aggregator" {
  description = "Monitor aggregator for Storage authorization errors [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
variable "authorization_error_requests_timeframe" {
description = "Monitor timeframe for Storage authorization errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -11,7 +11,7 @@ resource "datadog_monitor" "availability" {
message = "${coalesce(var.availability_message, var.message)}"
query = <<EOF
avg(${var.availability_timeframe}): (default(
${var.availability_time_aggregator}(${var.availability_timeframe}): (default(
avg:azure.storage.availability{${data.template_file.filter.rendered},transaction_type:all} by {resource_group,storage_type,name},
100)) < ${var.availability_threshold_critical}
EOF
@ -42,7 +42,7 @@ resource "datadog_monitor" "successful_requests" {
message = "${coalesce(var.successful_requests_message, var.message)}"
query = <<EOF
avg(${var.successful_requests_timeframe}): (default(
${var.successful_requests_time_aggregator}(${var.successful_requests_timeframe}): (default(
avg:azure.storage.percent_success{${data.template_file.filter.rendered},transaction_type:all} by {resource_group,storage_type,name},
100)) < ${var.successful_requests_threshold_critical}
EOF
@ -73,7 +73,7 @@ resource "datadog_monitor" "latency" {
message = "${coalesce(var.latency_message, var.message)}"
query = <<EOF
min(${var.latency_timeframe}): (default(
${var.latency_time_aggregator}(${var.latency_timeframe}): (default(
avg:azure.storage.average_e2_e_latency{${data.template_file.filter.rendered},transaction_type:all} by {resource_group,storage_type,name},
0)) > ${var.latency_threshold_critical}
EOF
@ -104,7 +104,7 @@ resource "datadog_monitor" "timeout_error_requests" {
message = "${coalesce(var.timeout_error_requests_message, var.message)}"
query = <<EOF
avg(${var.timeout_error_requests_timeframe}): (default(
${var.timeout_error_requests_time_aggregator}(${var.timeout_error_requests_timeframe}): (default(
avg:azure.storage.percent_timeout_error{${data.template_file.filter.rendered},transaction_type:all} by {resource_group,storage_type,name},
0)) > ${var.timeout_error_requests_threshold_critical}
EOF
@ -135,7 +135,7 @@ resource "datadog_monitor" "network_error_requests" {
message = "${coalesce(var.network_error_requests_message, var.message)}"
query = <<EOF
avg(${var.network_error_requests_timeframe}): (default(
${var.network_error_requests_time_aggregator}(${var.network_error_requests_timeframe}): (default(
avg:azure.storage.percent_network_error{${data.template_file.filter.rendered},transaction_type:all} by {resource_group,storage_type,name},
0)) > ${var.network_error_requests_threshold_critical}
EOF
@ -166,7 +166,7 @@ resource "datadog_monitor" "throttling_error_requests" {
message = "${coalesce(var.throttling_error_requests_message, var.message)}"
query = <<EOF
avg(${var.throttling_error_requests_timeframe}): (default(
${var.throttling_error_requests_time_aggregator}(${var.throttling_error_requests_timeframe}): (default(
avg:azure.storage.percent_throttling_error{${data.template_file.filter.rendered},transaction_type:all} by {resource_group,storage_type,name},
0)) > ${var.throttling_error_requests_threshold_critical}
EOF
@ -197,7 +197,7 @@ resource "datadog_monitor" "server_other_error_requests" {
message = "${coalesce(var.server_other_error_requests_message, var.message)}"
query = <<EOF
avg(${var.server_other_error_requests_timeframe}): (default(
${var.server_other_error_requests_time_aggregator}(${var.server_other_error_requests_timeframe}): (default(
avg:azure.storage.percent_server_other_error{${data.template_file.filter.rendered},transaction_type:all} by {resource_group,storage_type,name},
0)) > ${var.server_other_error_requests_threshold_critical}
EOF
@ -228,7 +228,7 @@ resource "datadog_monitor" "client_other_error_requests" {
message = "${coalesce(var.client_other_error_requests_message, var.message)}"
query = <<EOF
avg(${var.client_other_error_requests_timeframe}): (default(
${var.client_other_error_requests_time_aggregator}(${var.client_other_error_requests_timeframe}): (default(
avg:azure.storage.percent_client_other_error{${data.template_file.filter.rendered},transaction_type:all} by {resource_group,storage_type,name},
0)) > ${var.client_other_error_requests_threshold_critical}
EOF
@ -259,7 +259,7 @@ resource "datadog_monitor" "authorization_error_requests" {
message = "${coalesce(var.authorization_error_requests_message, var.message)}"
query = <<EOF
avg(${var.authorization_error_requests_timeframe}): (default(
${var.authorization_error_requests_time_aggregator}(${var.authorization_error_requests_timeframe}): (default(
avg:azure.storage.percent_authorization_error{${data.template_file.filter.rendered},transaction_type:all} by {resource_group,storage_type,name},
0)) > ${var.authorization_error_requests_threshold_critical}
EOF

View File

@ -37,6 +37,12 @@ variable "status_message" {
default = ""
}
# Time aggregation function of the Stream Analytics status monitor. It is
# interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.status_time_aggregator}(${var.status_timeframe}): ( ... ) < 1
variable "status_time_aggregator" {
  description = "Monitor aggregator for Stream Analytics status [available values: min, max or avg]"
  type        = "string"
  default     = "max"
}
variable "status_timeframe" {
description = "Monitor timeframe for Stream Analytics status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -55,6 +61,12 @@ variable "su_utilization_message" {
default = ""
}
# Time aggregation function of the Stream Analytics utilization monitor. It is
# interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.su_utilization_time_aggregator}(${var.su_utilization_timeframe}): ...
variable "su_utilization_time_aggregator" {
  description = "Monitor aggregator for Stream Analytics utilization [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
variable "su_utilization_timeframe" {
description = "Monitor timeframe for Stream Analytics utilization [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -111,6 +123,12 @@ variable "conversion_errors_message" {
default = ""
}
# Time aggregation function of the Stream Analytics conversion errors monitor.
# It is interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.conversion_errors_time_aggregator}(${var.conversion_errors_timeframe}): ...
variable "conversion_errors_time_aggregator" {
  description = "Monitor aggregator for Stream Analytics conversion errors [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
variable "conversion_errors_timeframe" {
description = "Monitor timeframe for Stream Analytics conversion errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -139,6 +157,12 @@ variable "runtime_errors_message" {
default = ""
}
# Time aggregation function of the Stream Analytics runtime errors monitor.
# It is interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.runtime_errors_time_aggregator}(${var.runtime_errors_timeframe}): ...
variable "runtime_errors_time_aggregator" {
  description = "Monitor aggregator for Stream Analytics runtime errors [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
variable "runtime_errors_timeframe" {
description = "Monitor timeframe for Stream Analytics runtime errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -11,7 +11,9 @@ resource "datadog_monitor" "status" {
message = "${coalesce(var.status_message, var.message)}"
query = <<EOF
avg(${var.status_timeframe}):avg:azure.streamanalytics_streamingjobs.status{${data.template_file.filter.rendered}} by {resource_group,region,name} < 1
${var.status_time_aggregator}(${var.status_timeframe}): (
avg:azure.streamanalytics_streamingjobs.status{${data.template_file.filter.rendered}} by {resource_group,region,name}
) < 1
EOF
type = "metric alert"
@ -36,7 +38,7 @@ resource "datadog_monitor" "su_utilization" {
message = "${coalesce(var.su_utilization_message, var.message)}"
query = <<EOF
avg(${var.su_utilization_timeframe}): (
${var.su_utilization_time_aggregator}(${var.su_utilization_timeframe}): (
avg:azure.streamanalytics_streamingjobs.resource_utilization{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.su_utilization_threshold_critical}
EOF
@ -101,7 +103,7 @@ resource "datadog_monitor" "conversion_errors" {
message = "${coalesce(var.conversion_errors_message, var.message)}"
query = <<EOF
avg(${var.conversion_errors_timeframe}): (
${var.conversion_errors_time_aggregator}(${var.conversion_errors_timeframe}): (
avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.conversion_errors_threshold_critical}
EOF
@ -133,7 +135,7 @@ resource "datadog_monitor" "runtime_errors" {
message = "${coalesce(var.runtime_errors_message, var.message)}"
query = <<EOF
avg(${var.runtime_errors_timeframe}): (
${var.runtime_errors_time_aggregator}(${var.runtime_errors_timeframe}): (
avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.runtime_errors_threshold_critical}
EOF

Binary file not shown.

View File

@ -1,3 +1,4 @@
output "alerting-message" {
value = "${data.template_file.alerting-message.rendered}"
description = "The generated message string"
value = "${data.template_file.alerting-message.rendered}"
}

View File

@ -81,4 +81,6 @@ Inputs
| message | Message sent when an alert is triggered | string | - | yes |
| mongodb_replicaset_message | Custom message for Mongodb replicaset monitor | string | `` | no |
| mongodb_replicaset_silenced | Groups to mute for Mongodb replicaset monitor | map | `<map>` | no |
| mongodb_replicaset_time_aggregator | Monitor aggregator for Mongodb replicaset [available values: min, max or avg] | string | `max` | no |
| mongodb_replicaset_timeframe | Monitor timeframe for Mongodb replicaset [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |

View File

@ -36,6 +36,12 @@ variable "mongodb_replicaset_message" {
default = ""
}
# Time aggregation function of the Mongodb replicaset monitor. It is
# interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.mongodb_replicaset_time_aggregator}(${var.mongodb_replicaset_timeframe}): ( ... ) < 1
variable "mongodb_replicaset_time_aggregator" {
  description = "Monitor aggregator for Mongodb replicaset [available values: min, max or avg]"
  type        = "string"
  default     = "max"
}
variable "mongodb_replicaset_timeframe" {
description = "Monitor timeframe for Mongodb replicaset [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -11,7 +11,7 @@ resource "datadog_monitor" "mongodb_replicaset_state" {
message = "${coalesce(var.mongodb_replicaset_message, var.message)}"
query = <<EOF
avg(${var.mongodb_replicaset_timeframe}): (
${var.mongodb_replicaset_time_aggregator}(${var.mongodb_replicaset_timeframe}): (
avg:mongodb.replset.health{${data.template_file.filter.rendered}} by {region,replset_name}
) < 1
EOF

View File

@ -19,15 +19,16 @@ Creates a DataDog monitors with the following checks :
* Apache connect
Inputs
------
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| apache_connect_message | Custom message for Apache process monitor | string | `` | no |
| apache_connect_silenced | Groups to mute for Apache process monitor | map | `<map>` | no |
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
| environment | Architecture Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| message | Message sent when an alert is triggered | string | - | yes |
Inputs
------
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| apache_connect_message | Custom message for Apache process monitor | string | `` | no |
| apache_connect_silenced | Groups to mute for Apache process monitor | map | `<map>` | no |
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
| environment | Architecture Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| message | Message sent when an alert is triggered | string | - | yes |

View File

@ -19,15 +19,16 @@ Creates a DataDog monitors with the following checks :
* Nginx connect
Inputs
------
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| environment | Architecture Environment | string | - | yes |
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| message | Message sent when an alert is triggered | string | - | yes |
| nginx_connect_message | Custom message for Nginx process monitor | string | `` | no |
| nginx_connect_silenced | Groups to mute for Nginx process monitor | map | `<map>` | no |
Inputs
------
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
| environment | Architecture Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| message | Message sent when an alert is triggered | string | - | yes |
| nginx_connect_message | Custom message for Nginx process monitor | string | `` | no |
| nginx_connect_silenced | Groups to mute for Nginx process monitor | map | `<map>` | no |

View File

@ -20,20 +20,22 @@ Creates a DataDog monitors with the following checks :
* PHP FPM connect
* PHP FPM load
Inputs
------
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
| environment | Architecture Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| message | Message sent when an alert is triggered | string | - | yes |
| php_fpm_busy_message | Custom message for PHP FPM busy worker monitor | string | `` | no |
| php_fpm_busy_silenced | Groups to mute for PHP FPM busy worker monitor | map | `<map>` | no |
| php_fpm_busy_threshold_critical | php fpm busy critical threshold | string | `0.9` | no |
| php_fpm_busy_threshold_warning | php fpm busy warning threshold | string | `0.8` | no |
| php_fpm_busy_timeframe | Monitor timeframe for PHP FPM busy worker [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_10m` | no |
| php_fpm_connect_message | Custom message for PHP FPM process monitor | string | `` | no |
| php_fpm_connect_silenced | Groups to mute for PHP FPM process monitor | map | `<map>` | no |
Inputs
------
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
| environment | Architecture Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| message | Message sent when an alert is triggered | string | - | yes |
| php_fpm_busy_message | Custom message for PHP FPM busy worker monitor | string | `` | no |
| php_fpm_busy_silenced | Groups to mute for PHP FPM busy worker monitor | map | `<map>` | no |
| php_fpm_busy_threshold_critical | php fpm busy critical threshold | string | `0.9` | no |
| php_fpm_busy_threshold_warning | php fpm busy warning threshold | string | `0.8` | no |
| php_fpm_busy_time_aggregator | Monitor aggregator for PHP FPM busy worker [available values: min, max or avg] | string | `avg` | no |
| php_fpm_busy_timeframe | Monitor timeframe for PHP FPM busy worker [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_10m` | no |
| php_fpm_connect_message | Custom message for PHP FPM process monitor | string | `` | no |
| php_fpm_connect_silenced | Groups to mute for PHP FPM process monitor | map | `<map>` | no |

View File

@ -38,6 +38,12 @@ variable "php_fpm_busy_message" {
default = ""
}
# Time aggregation function of the PHP FPM busy worker monitor. It is
# interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.php_fpm_busy_time_aggregator}(${var.php_fpm_busy_timeframe}): ...
variable "php_fpm_busy_time_aggregator" {
  description = "Monitor aggregator for PHP FPM busy worker [available values: min, max or avg]"
  type        = "string"
  default     = "avg"
}
variable "php_fpm_busy_timeframe" {
description = "Monitor timeframe for PHP FPM busy worker [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -13,7 +13,7 @@ resource "datadog_monitor" "datadog_php_fpm_connect_idle" {
type = "metric alert"
query = <<EOF
avg(${var.php_fpm_busy_timeframe}): (
${var.php_fpm_busy_time_aggregator}(${var.php_fpm_busy_timeframe}): (
avg:php_fpm.processes.active{${data.template_file.filter.rendered}} by {region, host} /
( avg:php_fpm.processes.idle{${data.template_file.filter.rendered}} by {region, host} +
avg:php_fpm.processes.active{${data.template_file.filter.rendered}} by {region, host} )

View File

@ -23,38 +23,44 @@ Creates a DataDog monitors with the following checks :
* Free disk inodes
* Free disk space
Inputs
------
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| cpu_high_message | Custom message for CPU high monitor | string | `` | no |
| cpu_high_silenced | Groups to mute for CPU high monitor | map | `<map>` | no |
| cpu_high_threshold_critical | CPU high critical threshold | string | `95` | no |
| cpu_high_threshold_warning | CPU high warning threshold | string | `80` | no |
| cpu_high_timeframe | Monitor timeframe for CPU high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| cpu_load_message | Custom message for CPU load ratio monitor | string | `` | no |
| cpu_load_silenced | Groups to mute for CPU load ratio monitor | map | `<map>` | no |
| cpu_load_threshold_critical | CPU load ratio critical threshold | string | `4` | no |
| cpu_load_threshold_warning | CPU load ratio warning threshold | string | `3` | no |
| cpu_load_timeframe | Monitor timeframe for CPU load ratio [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
| environment | Architecture Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| free_disk_inodes_message | Custom message for Free disk inodes monitor | string | `` | no |
| free_disk_inodes_silenced | Groups to mute for Free disk inodes monitor | map | `<map>` | no |
| free_disk_inodes_threshold_critical | Free disk space critical threshold | string | `5` | no |
| free_disk_inodes_threshold_warning | Free disk space warning threshold | string | `10` | no |
| free_disk_inodes_timeframe | Monitor timeframe for Free disk inodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| free_disk_space_message | Custom message for Free diskspace monitor | string | `` | no |
| free_disk_space_silenced | Groups to mute for Free diskspace monitor | map | `<map>` | no |
| free_disk_space_threshold_critical | Free disk space critical threshold | string | `5` | no |
| free_disk_space_threshold_warning | Free disk space warning threshold | string | `10` | no |
| free_disk_space_timeframe | Monitor timeframe for Free diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| free_memory_message | Custom message for Free memory monitor | string | - | yes |
| free_memory_silenced | Groups to mute for Free memory monitor | map | `<map>` | no |
| free_memory_threshold_critical | Free disk space critical threshold | string | `5` | no |
| free_memory_threshold_warning | Free disk space warning threshold | string | `10` | no |
| free_memory_timeframe | Monitor timeframe for Free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_1m` | no |
| message | Message sent when an alert is triggered | string | - | yes |
Inputs
------
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| cpu_high_message | Custom message for CPU high monitor | string | `` | no |
| cpu_high_silenced | Groups to mute for CPU high monitor | map | `<map>` | no |
| cpu_high_threshold_critical | CPU high critical threshold | string | `90` | no |
| cpu_high_threshold_warning | CPU high warning threshold | string | `85` | no |
| cpu_high_time_aggregator | Monitor aggregator for CPU high [available values: min, max or avg] | string | `min` | no |
| cpu_high_timeframe | Monitor timeframe for CPU high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_10m` | no |
| cpu_load_message | Custom message for CPU load ratio monitor | string | `` | no |
| cpu_load_silenced | Groups to mute for CPU load ratio monitor | map | `<map>` | no |
| cpu_load_threshold_critical | CPU load ratio critical threshold | string | `2.5` | no |
| cpu_load_threshold_warning | CPU load ratio warning threshold | string | `2` | no |
| cpu_load_time_aggregator | Monitor aggregator for CPU load ratio [available values: min, max or avg] | string | `min` | no |
| cpu_load_timeframe | Monitor timeframe for CPU load ratio [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
| environment | Architecture Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| free_disk_inodes_message | Custom message for Free disk inodes monitor | string | `` | no |
| free_disk_inodes_silenced | Groups to mute for Free disk inodes monitor | map | `<map>` | no |
| free_disk_inodes_threshold_critical | Free disk inodes critical threshold | string | `5` | no |
| free_disk_inodes_threshold_warning | Free disk inodes warning threshold | string | `10` | no |
| free_disk_inodes_time_aggregator | Monitor aggregator for Free disk inodes [available values: min, max or avg] | string | `min` | no |
| free_disk_inodes_timeframe | Monitor timeframe for Free disk inodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| free_disk_space_message | Custom message for Free diskspace monitor | string | `` | no |
| free_disk_space_silenced | Groups to mute for Free diskspace monitor | map | `<map>` | no |
| free_disk_space_threshold_critical | Free disk space critical threshold | string | `10` | no |
| free_disk_space_threshold_warning | Free disk space warning threshold | string | `20` | no |
| free_disk_space_time_aggregator | Monitor aggregator for Free diskspace [available values: min, max or avg] | string | `min` | no |
| free_disk_space_timeframe | Monitor timeframe for Free diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| free_memory_message | Custom message for Free memory monitor | string | - | yes |
| free_memory_silenced | Groups to mute for Free memory monitor | map | `<map>` | no |
| free_memory_threshold_critical | Free memory critical threshold | string | `5` | no |
| free_memory_threshold_warning | Free memory warning threshold | string | `10` | no |
| free_memory_time_aggregator | Monitor aggregator for Free memory [available values: min, max or avg] | string | `max` | no |
| free_memory_timeframe | Monitor timeframe for Free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| message | Message sent when an alert is triggered | string | - | yes |

View File

@ -38,20 +38,26 @@ variable "cpu_high_message" {
default = ""
}
# Time aggregation function of the CPU high monitor. It is interpolated at
# the head of the monitor query, in front of the timeframe:
#   ${var.cpu_high_time_aggregator}(${var.cpu_high_timeframe}): ...
variable "cpu_high_time_aggregator" {
  description = "Monitor aggregator for CPU high [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
variable "cpu_high_timeframe" {
description = "Monitor timeframe for CPU high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
default = "last_10m"
}
variable "cpu_high_threshold_warning" {
description = "CPU high warning threshold"
default = 80
default = 85
}
variable "cpu_high_threshold_critical" {
description = "CPU high critical threshold"
default = 95
default = 90
}
variable "cpu_load_silenced" {
@ -66,20 +72,26 @@ variable "cpu_load_message" {
default = ""
}
# Time aggregation function of the CPU load ratio monitor. It is interpolated
# at the head of the monitor query, in front of the timeframe:
#   ${var.cpu_load_time_aggregator}(${var.cpu_load_timeframe}): ...
variable "cpu_load_time_aggregator" {
  description = "Monitor aggregator for CPU load ratio [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
variable "cpu_load_timeframe" {
description = "Monitor timeframe for CPU load ratio [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_5m"
default = "last_15m"
}
variable "cpu_load_threshold_warning" {
description = "CPU load ratio warning threshold"
default = 3
default = 2
}
variable "cpu_load_threshold_critical" {
description = "CPU load ratio critical threshold"
default = 4
default = 2.5
}
variable "free_disk_space_silenced" {
@ -94,6 +106,12 @@ variable "free_disk_space_message" {
default = ""
}
# Time aggregation function of the free disk space monitor. It is interpolated
# at the head of the monitor query, in front of the timeframe:
#   ${var.free_disk_space_time_aggregator}(${var.free_disk_space_timeframe}): ...
variable "free_disk_space_time_aggregator" {
  description = "Monitor aggregator for Free diskspace [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
variable "free_disk_space_timeframe" {
description = "Monitor timeframe for Free diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -102,12 +120,12 @@ variable "free_disk_space_timeframe" {
variable "free_disk_space_threshold_warning" {
description = "Free disk space warning threshold"
default = 10
default = 20
}
variable "free_disk_space_threshold_critical" {
description = "Free disk space critical threshold"
default = 5
default = 10
}
variable "free_disk_inodes_silenced" {
@ -122,6 +140,12 @@ variable "free_disk_inodes_message" {
default = ""
}
# Time aggregation function of the free disk inodes monitor. It is
# interpolated at the head of the monitor query, in front of the timeframe:
#   ${var.free_disk_inodes_time_aggregator}(${var.free_disk_inodes_timeframe}): ...
variable "free_disk_inodes_time_aggregator" {
  description = "Monitor aggregator for Free disk inodes [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}
variable "free_disk_inodes_timeframe" {
description = "Monitor timeframe for Free disk inodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -149,10 +173,16 @@ variable "free_memory_message" {
type = "string"
}
# Time aggregation function of the free memory monitor.
# NOTE(review): presumably interpolated at the head of the free memory monitor
# query, in front of free_memory_timeframe, like the other *_time_aggregator
# inputs of this module -- confirm against the datadog_free_memory resource.
variable "free_memory_time_aggregator" {
  description = "Monitor aggregator for Free memory [available values: min, max or avg]"
  type        = "string"
  default     = "max"
}
variable "free_memory_timeframe" {
description = "Monitor timeframe for Free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
default = "last_1m"
default = "last_5m"
}
variable "free_memory_threshold_warning" {

View File

@ -11,7 +11,7 @@ resource "datadog_monitor" "datadog_cpu_too_high" {
message = "${coalesce(var.cpu_high_message, var.message)}"
query = <<EOF
min(${var.cpu_high_timeframe}): (
${var.cpu_high_time_aggregator}(${var.cpu_high_timeframe}): (
100 - avg:system.cpu.idle{${data.template_file.filter.rendered}} by {region,host}
) > ${var.cpu_high_threshold_critical}
EOF
@ -42,7 +42,7 @@ resource "datadog_monitor" "datadog_load_too_high" {
message = "${coalesce(var.cpu_load_message, var.message)}"
query = <<EOF
min(${var.cpu_load_timeframe}): (
${var.cpu_load_time_aggregator}(${var.cpu_load_timeframe}): (
avg:system.load.5{${data.template_file.filter.rendered}} by {region,host} /
avg:system.core.count{${data.template_file.filter.rendered}} by {region,host}
) > ${var.cpu_load_threshold_critical}
@ -74,7 +74,7 @@ resource "datadog_monitor" "datadog_free_disk_space_too_low" {
message = "${coalesce(var.free_disk_space_message, var.message)}"
query = <<EOF
min(${var.free_disk_space_timeframe}): (
${var.free_disk_space_time_aggregator}(${var.free_disk_space_timeframe}): (
avg:system.disk.free{${data.template_file.filter.rendered},dd_disk:enabled} by {region,host,device} /
avg:system.disk.total{${data.template_file.filter.rendered},dd_disk:enabled} by {region,host,device} * 100
) < ${var.free_disk_space_threshold_critical}
@ -106,7 +106,7 @@ resource "datadog_monitor" "datadog_free_disk_space_inodes_too_low" {
message = "${coalesce(var.free_disk_inodes_message, var.message)}"
query = <<EOF
min(${var.free_disk_inodes_timeframe}): (
${var.free_disk_inodes_time_aggregator}(${var.free_disk_inodes_timeframe}): (
avg:system.fs.inodes.free{${data.template_file.filter.rendered},dd_disk:enabled} by {region,host,device} /
avg:system.fs.inodes.total{${data.template_file.filter.rendered},dd_disk:enabled} by {region,host,device} * 100
) < ${var.free_disk_inodes_threshold_critical}
@ -138,8 +138,8 @@ resource "datadog_monitor" "datadog_free_memory" {
message = "${var.free_memory_message}"
query = <<EOF
min(${var.free_memory_timeframe}): (
avg:system.mem.free{${data.template_file.filter.rendered}} by {region,host} /
${var.free_memory_time_aggregator}(${var.free_memory_timeframe}): (
avg:system.mem.usable{${data.template_file.filter.rendered}} by {region,host} /
avg:system.mem.total{${data.template_file.filter.rendered}} by {region,host} * 100
) < ${var.free_memory_threshold_critical}
EOF