Merged in MON-309-fix-all-monitors-using-as_count- (pull request #169)
MON-309 change as_count by as_rate and fix no data Approved-by: SJ <sjeoffroi@morea.fr> Approved-by: Alexandre Gaillet <alexandre.gaillet@fr.clara.net> Approved-by: Jean-Maxime LEBLANC <jean-maxime.leblanc@fr.clara.net> Approved-by: Laurent Piroelle <laurent.piroelle@fr.clara.net> Approved-by: Quentin Manfroi <quentin.manfroi@yahoo.fr>
This commit is contained in:
commit
25bcb8d619
@ -35,6 +35,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| ingress_4xx_silenced | Groups to mute for Ingress 4xx errors monitor | map | `{}` | no |
|
| ingress_4xx_silenced | Groups to mute for Ingress 4xx errors monitor | map | `{}` | no |
|
||||||
| ingress_4xx_threshold_critical | 4xx critical threshold in percentage | string | `40` | no |
|
| ingress_4xx_threshold_critical | 4xx critical threshold in percentage | string | `40` | no |
|
||||||
| ingress_4xx_threshold_warning | 4xx warning threshold in percentage | string | `20` | no |
|
| ingress_4xx_threshold_warning | 4xx warning threshold in percentage | string | `20` | no |
|
||||||
|
| ingress_4xx_time_aggregator | Monitor aggregator for Ingress 4xx errors [available values: min, max or avg] | string | `min` | no |
|
||||||
| ingress_4xx_timeframe | Monitor timeframe for Ingress 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| ingress_4xx_timeframe | Monitor timeframe for Ingress 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| ingress_5xx_enabled | Flag to enable Ingress 5xx errors monitor | string | `true` | no |
|
| ingress_5xx_enabled | Flag to enable Ingress 5xx errors monitor | string | `true` | no |
|
||||||
| ingress_5xx_extra_tags | Extra tags for Ingress 5xx errors monitor | list | `[]` | no |
|
| ingress_5xx_extra_tags | Extra tags for Ingress 5xx errors monitor | list | `[]` | no |
|
||||||
@ -42,6 +43,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| ingress_5xx_silenced | Groups to mute for Ingress 5xx errors monitor | map | `{}` | no |
|
| ingress_5xx_silenced | Groups to mute for Ingress 5xx errors monitor | map | `{}` | no |
|
||||||
| ingress_5xx_threshold_critical | 5xx critical threshold in percentage | string | `20` | no |
|
| ingress_5xx_threshold_critical | 5xx critical threshold in percentage | string | `20` | no |
|
||||||
| ingress_5xx_threshold_warning | 5xx warning threshold in percentage | string | `10` | no |
|
| ingress_5xx_threshold_warning | 5xx warning threshold in percentage | string | `10` | no |
|
||||||
|
| ingress_5xx_time_aggregator | Monitor aggregator for Ingress 5xx errors [available values: min, max or avg] | string | `min` | no |
|
||||||
| ingress_5xx_timeframe | Monitor timeframe for Ingress 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| ingress_5xx_timeframe | Monitor timeframe for Ingress 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| message | Message sent when an alert is triggered | string | - | yes |
|
| message | Message sent when an alert is triggered | string | - | yes |
|
||||||
| new_host_delay | Delay in seconds before monitor new resource | string | `300` | no |
|
| new_host_delay | Delay in seconds before monitor new resource | string | `300` | no |
|
||||||
|
|||||||
@ -58,6 +58,12 @@ variable "ingress_5xx_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "ingress_5xx_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for Ingress 5xx errors [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "ingress_5xx_timeframe" {
|
variable "ingress_5xx_timeframe" {
|
||||||
description = "Monitor timeframe for Ingress 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for Ingress 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -99,6 +105,12 @@ variable "ingress_4xx_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "ingress_4xx_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for Ingress 4xx errors [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "ingress_4xx_timeframe" {
|
variable "ingress_4xx_timeframe" {
|
||||||
description = "Monitor timeframe for Ingress 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for Ingress 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
|
|||||||
@ -4,12 +4,10 @@ resource "datadog_monitor" "nginx_ingress_too_many_5xx" {
|
|||||||
message = "${coalesce(var.ingress_5xx_message, var.message)}"
|
message = "${coalesce(var.ingress_5xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.ingress_5xx_timeframe}): (
|
${var.ingress_5xx_time_aggregator}(${var.ingress_5xx_timeframe}): (
|
||||||
default(
|
default(avg:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-5xx.query_alert} by {upstream,ingress_class}.as_rate(), 0) / (
|
||||||
avg:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-5xx.query_alert} by {upstream,ingress_class} /
|
default(avg:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
(avg:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class} + ${var.artificial_requests_count}),
|
) * 100 > ${var.ingress_5xx_threshold_critical}
|
||||||
0) * 100
|
|
||||||
) > ${var.ingress_5xx_threshold_critical}
|
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
@ -40,12 +38,10 @@ resource "datadog_monitor" "nginx_ingress_too_many_4xx" {
|
|||||||
message = "${coalesce(var.ingress_4xx_message, var.message)}"
|
message = "${coalesce(var.ingress_4xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.ingress_4xx_timeframe}): (
|
${var.ingress_4xx_time_aggregator}(${var.ingress_4xx_timeframe}): (
|
||||||
default(
|
default(avg:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-4xx.query_alert} by {upstream,ingress_class}.as_rate(), 0) / (
|
||||||
avg:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-4xx.query_alert} by {upstream,ingress_class} /
|
default(avg:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
(avg:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class} + ${var.artificial_requests_count}),
|
) * 100 > ${var.ingress_4xx_threshold_critical}
|
||||||
0) * 100
|
|
||||||
) > ${var.ingress_4xx_threshold_critical}
|
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|||||||
@ -45,6 +45,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| httpcode_alb_4xx_silenced | Groups to mute for ALB httpcode 4xx monitor | map | `{}` | no |
|
| httpcode_alb_4xx_silenced | Groups to mute for ALB httpcode 4xx monitor | map | `{}` | no |
|
||||||
| httpcode_alb_4xx_threshold_critical | loadbalancer 4xx critical threshold in percentage | string | `80` | no |
|
| httpcode_alb_4xx_threshold_critical | loadbalancer 4xx critical threshold in percentage | string | `80` | no |
|
||||||
| httpcode_alb_4xx_threshold_warning | loadbalancer 4xx warning threshold in percentage | string | `60` | no |
|
| httpcode_alb_4xx_threshold_warning | loadbalancer 4xx warning threshold in percentage | string | `60` | no |
|
||||||
|
| httpcode_alb_4xx_time_aggregator | Monitor aggregator for ALB httpcode 4xx [available values: min, max or avg] | string | `min` | no |
|
||||||
| httpcode_alb_4xx_timeframe | Monitor timeframe for ALB httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| httpcode_alb_4xx_timeframe | Monitor timeframe for ALB httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| httpcode_alb_5xx_enabled | Flag to enable ALB httpcode 5xx monitor | string | `true` | no |
|
| httpcode_alb_5xx_enabled | Flag to enable ALB httpcode 5xx monitor | string | `true` | no |
|
||||||
| httpcode_alb_5xx_extra_tags | Extra tags for ALB httpcode 5xx monitor | list | `[]` | no |
|
| httpcode_alb_5xx_extra_tags | Extra tags for ALB httpcode 5xx monitor | list | `[]` | no |
|
||||||
@ -52,6 +53,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| httpcode_alb_5xx_silenced | Groups to mute for ALB httpcode 5xx monitor | map | `{}` | no |
|
| httpcode_alb_5xx_silenced | Groups to mute for ALB httpcode 5xx monitor | map | `{}` | no |
|
||||||
| httpcode_alb_5xx_threshold_critical | loadbalancer 5xx critical threshold in percentage | string | `80` | no |
|
| httpcode_alb_5xx_threshold_critical | loadbalancer 5xx critical threshold in percentage | string | `80` | no |
|
||||||
| httpcode_alb_5xx_threshold_warning | loadbalancer 5xx warning threshold in percentage | string | `60` | no |
|
| httpcode_alb_5xx_threshold_warning | loadbalancer 5xx warning threshold in percentage | string | `60` | no |
|
||||||
|
| httpcode_alb_5xx_time_aggregator | Monitor aggregator for ALB httpcode 5xx [available values: min, max or avg] | string | `min` | no |
|
||||||
| httpcode_alb_5xx_timeframe | Monitor timeframe for ALB httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| httpcode_alb_5xx_timeframe | Monitor timeframe for ALB httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| httpcode_target_4xx_enabled | Flag to enable ALB target httpcode 4xx monitor | string | `true` | no |
|
| httpcode_target_4xx_enabled | Flag to enable ALB target httpcode 4xx monitor | string | `true` | no |
|
||||||
| httpcode_target_4xx_extra_tags | Extra tags for ALB target httpcode 4xx monitor | list | `[]` | no |
|
| httpcode_target_4xx_extra_tags | Extra tags for ALB target httpcode 4xx monitor | list | `[]` | no |
|
||||||
@ -59,6 +61,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| httpcode_target_4xx_silenced | Groups to mute for ALB target httpcode 4xx monitor | map | `{}` | no |
|
| httpcode_target_4xx_silenced | Groups to mute for ALB target httpcode 4xx monitor | map | `{}` | no |
|
||||||
| httpcode_target_4xx_threshold_critical | target 4xx critical threshold in percentage | string | `80` | no |
|
| httpcode_target_4xx_threshold_critical | target 4xx critical threshold in percentage | string | `80` | no |
|
||||||
| httpcode_target_4xx_threshold_warning | target 4xx warning threshold in percentage | string | `60` | no |
|
| httpcode_target_4xx_threshold_warning | target 4xx warning threshold in percentage | string | `60` | no |
|
||||||
|
| httpcode_target_4xx_time_aggregator | Monitor aggregator for ALB target httpcode 4xx [available values: min, max or avg] | string | `min` | no |
|
||||||
| httpcode_target_4xx_timeframe | Monitor timeframe for ALB target httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| httpcode_target_4xx_timeframe | Monitor timeframe for ALB target httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| httpcode_target_5xx_enabled | Flag to enable ALB target httpcode 5xx monitor | string | `true` | no |
|
| httpcode_target_5xx_enabled | Flag to enable ALB target httpcode 5xx monitor | string | `true` | no |
|
||||||
| httpcode_target_5xx_extra_tags | Extra tags for ALB target httpcode 5xx monitor | list | `[]` | no |
|
| httpcode_target_5xx_extra_tags | Extra tags for ALB target httpcode 5xx monitor | list | `[]` | no |
|
||||||
@ -66,6 +69,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| httpcode_target_5xx_silenced | Groups to mute for ALB target httpcode 5xx monitor | map | `{}` | no |
|
| httpcode_target_5xx_silenced | Groups to mute for ALB target httpcode 5xx monitor | map | `{}` | no |
|
||||||
| httpcode_target_5xx_threshold_critical | target 5xx critical threshold in percentage | string | `80` | no |
|
| httpcode_target_5xx_threshold_critical | target 5xx critical threshold in percentage | string | `80` | no |
|
||||||
| httpcode_target_5xx_threshold_warning | target 5xx warning threshold in percentage | string | `60` | no |
|
| httpcode_target_5xx_threshold_warning | target 5xx warning threshold in percentage | string | `60` | no |
|
||||||
|
| httpcode_target_5xx_time_aggregator | Monitor aggregator for ALB target httpcode 5xx [available values: min, max or avg] | string | `min` | no |
|
||||||
| httpcode_target_5xx_timeframe | Monitor timeframe for ALB target httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| httpcode_target_5xx_timeframe | Monitor timeframe for ALB target httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| latency_enabled | Flag to enable ALB latency monitor | string | `true` | no |
|
| latency_enabled | Flag to enable ALB latency monitor | string | `true` | no |
|
||||||
| latency_extra_tags | Extra tags for ALB latency monitor | list | `[]` | no |
|
| latency_extra_tags | Extra tags for ALB latency monitor | list | `[]` | no |
|
||||||
|
|||||||
@ -142,6 +142,12 @@ variable "httpcode_alb_4xx_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "httpcode_alb_4xx_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for ALB httpcode 4xx [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "httpcode_alb_4xx_timeframe" {
|
variable "httpcode_alb_4xx_timeframe" {
|
||||||
description = "Monitor timeframe for ALB httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for ALB httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -182,6 +188,12 @@ variable "httpcode_target_4xx_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "httpcode_target_4xx_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for ALB target httpcode 4xx [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "httpcode_target_4xx_timeframe" {
|
variable "httpcode_target_4xx_timeframe" {
|
||||||
description = "Monitor timeframe for ALB target httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for ALB target httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -222,6 +234,12 @@ variable "httpcode_alb_5xx_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "httpcode_alb_5xx_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for ALB httpcode 5xx [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "httpcode_alb_5xx_timeframe" {
|
variable "httpcode_alb_5xx_timeframe" {
|
||||||
description = "Monitor timeframe for ALB httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for ALB httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -262,6 +280,12 @@ variable "httpcode_target_5xx_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "httpcode_target_5xx_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for ALB target httpcode 5xx [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "httpcode_target_5xx_timeframe" {
|
variable "httpcode_target_5xx_timeframe" {
|
||||||
description = "Monitor timeframe for ALB target httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for ALB target httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
|
|||||||
@ -69,9 +69,9 @@ resource "datadog_monitor" "ALB_httpcode_5xx" {
|
|||||||
message = "${coalesce(var.httpcode_alb_5xx_message, var.message)}"
|
message = "${coalesce(var.httpcode_alb_5xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.httpcode_alb_5xx_timeframe}):
|
${var.httpcode_alb_5xx_time_aggregator}(${var.httpcode_alb_5xx_timeframe}):
|
||||||
default(avg:aws.applicationelb.httpcode_elb_5xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_count(), 0) / (
|
default(avg:aws.applicationelb.httpcode_elb_5xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
|
||||||
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_count(), 0) + ${var.artificial_requests_count})
|
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.httpcode_alb_5xx_threshold_critical}
|
* 100 > ${var.httpcode_alb_5xx_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -101,9 +101,9 @@ resource "datadog_monitor" "ALB_httpcode_4xx" {
|
|||||||
message = "${coalesce(var.httpcode_alb_4xx_message, var.message)}"
|
message = "${coalesce(var.httpcode_alb_4xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.httpcode_alb_4xx_timeframe}):
|
${var.httpcode_alb_4xx_time_aggregator}(${var.httpcode_alb_4xx_timeframe}):
|
||||||
default(avg:aws.applicationelb.httpcode_elb_4xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_count(), 0) / (
|
default(avg:aws.applicationelb.httpcode_elb_4xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
|
||||||
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_count(), 0) + ${var.artificial_requests_count})
|
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.httpcode_alb_4xx_threshold_critical}
|
* 100 > ${var.httpcode_alb_4xx_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -133,9 +133,9 @@ resource "datadog_monitor" "ALB_httpcode_target_5xx" {
|
|||||||
message = "${coalesce(var.httpcode_target_5xx_message, var.message)}"
|
message = "${coalesce(var.httpcode_target_5xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.httpcode_target_5xx_timeframe}):
|
${var.httpcode_target_5xx_time_aggregator}(${var.httpcode_target_5xx_timeframe}):
|
||||||
default(avg:aws.applicationelb.httpcode_target_5xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_count(), 0) / (
|
default(avg:aws.applicationelb.httpcode_target_5xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
|
||||||
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_count(), 0) + ${var.artificial_requests_count})
|
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.httpcode_target_5xx_threshold_critical}
|
* 100 > ${var.httpcode_target_5xx_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -165,9 +165,9 @@ resource "datadog_monitor" "ALB_httpcode_target_4xx" {
|
|||||||
message = "${coalesce(var.httpcode_target_4xx_message, var.message)}"
|
message = "${coalesce(var.httpcode_target_4xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.httpcode_target_4xx_timeframe}):
|
${var.httpcode_target_4xx_time_aggregator}(${var.httpcode_target_4xx_timeframe}):
|
||||||
default(avg:aws.applicationelb.httpcode_target_4xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_count(), 0) / (
|
default(avg:aws.applicationelb.httpcode_target_4xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
|
||||||
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_count(), 0) + ${var.artificial_requests_count})
|
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.httpcode_target_4xx_threshold_critical}
|
* 100 > ${var.httpcode_target_4xx_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
|||||||
@ -34,6 +34,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| http_4xx_requests_silenced | Groups to mute for API Gateway HTTP 4xx requests monitor | map | `{}` | no |
|
| http_4xx_requests_silenced | Groups to mute for API Gateway HTTP 4xx requests monitor | map | `{}` | no |
|
||||||
| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no |
|
| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no |
|
||||||
| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no |
|
| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no |
|
||||||
|
| http_4xx_requests_time_aggregator | Monitor aggregator for API HTTP 4xx requests [available values: min, max or avg] | string | `min` | no |
|
||||||
| http_4xx_requests_timeframe | Monitor timeframe for API HTTP 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| http_4xx_requests_timeframe | Monitor timeframe for API HTTP 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| http_5xx_requests_enabled | Flag to enable API Gateway HTTP 5xx requests monitor | string | `true` | no |
|
| http_5xx_requests_enabled | Flag to enable API Gateway HTTP 5xx requests monitor | string | `true` | no |
|
||||||
| http_5xx_requests_extra_tags | Extra tags for API Gateway HTTP 5xx requests monitor | list | `[]` | no |
|
| http_5xx_requests_extra_tags | Extra tags for API Gateway HTTP 5xx requests monitor | list | `[]` | no |
|
||||||
@ -41,6 +42,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| http_5xx_requests_silenced | Groups to mute for API Gateway HTTP 5xx requests monitor | map | `{}` | no |
|
| http_5xx_requests_silenced | Groups to mute for API Gateway HTTP 5xx requests monitor | map | `{}` | no |
|
||||||
| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no |
|
| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no |
|
||||||
| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no |
|
| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no |
|
||||||
|
| http_5xx_requests_time_aggregator | Monitor aggregator for API HTTP 5xx requests [available values: min, max or avg] | string | `min` | no |
|
||||||
| http_5xx_requests_timeframe | Monitor timeframe for API HTTP 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| http_5xx_requests_timeframe | Monitor timeframe for API HTTP 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| latency_enabled | Flag to enable API Gateway latency monitor | string | `true` | no |
|
| latency_enabled | Flag to enable API Gateway latency monitor | string | `true` | no |
|
||||||
| latency_extra_tags | Extra tags for API Gateway latency monitor | list | `[]` | no |
|
| latency_extra_tags | Extra tags for API Gateway latency monitor | list | `[]` | no |
|
||||||
|
|||||||
@ -100,6 +100,12 @@ variable "http_5xx_requests_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "http_5xx_requests_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for API HTTP 5xx requests [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "http_5xx_requests_timeframe" {
|
variable "http_5xx_requests_timeframe" {
|
||||||
description = "Monitor timeframe for API HTTP 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for API HTTP 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -144,6 +150,12 @@ variable "http_4xx_requests_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "http_4xx_requests_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for API HTTP 4xx requests [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "http_4xx_requests_timeframe" {
|
variable "http_4xx_requests_timeframe" {
|
||||||
description = "Monitor timeframe for API HTTP 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for API HTTP 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
|
|||||||
@ -38,9 +38,9 @@ resource "datadog_monitor" "API_http_5xx_errors_count" {
|
|||||||
message = "${coalesce(var.http_5xx_requests_message, var.message)}"
|
message = "${coalesce(var.http_5xx_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.http_5xx_requests_timeframe}):
|
${var.http_5xx_requests_time_aggregator}(${var.http_5xx_requests_timeframe}):
|
||||||
default(avg:aws.apigateway.5xxerror{${var.filter_tags}} by {region,apiname,stage}.as_count(), 0) / (
|
default(avg:aws.apigateway.5xxerror{${var.filter_tags}} by {region,apiname,stage}.as_rate(), 0) / (
|
||||||
default(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname,stage}.as_count(), 0) + ${var.artificial_requests_count})
|
default(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname,stage}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.http_5xx_requests_threshold_critical}
|
* 100 > ${var.http_5xx_requests_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -71,9 +71,9 @@ resource "datadog_monitor" "API_http_4xx_errors_count" {
|
|||||||
message = "${coalesce(var.http_4xx_requests_message, var.message)}"
|
message = "${coalesce(var.http_4xx_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.http_4xx_requests_timeframe}):
|
${var.http_4xx_requests_time_aggregator}(${var.http_4xx_requests_timeframe}):
|
||||||
default(avg:aws.apigateway.4xxerror{${var.filter_tags}} by {region,apiname,stage}.as_count(), 0) / (
|
default(avg:aws.apigateway.4xxerror{${var.filter_tags}} by {region,apiname,stage}.as_rate(), 0) / (
|
||||||
default(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname,stage}.as_count(), 0) + ${var.artificial_requests_count})
|
default(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname,stage}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.http_4xx_requests_threshold_critical}
|
* 100 > ${var.http_4xx_requests_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
|||||||
@ -42,6 +42,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| get_hits_silenced | Groups to mute for Elasticache memcached get hits monitor | map | `{}` | no |
|
| get_hits_silenced | Groups to mute for Elasticache memcached get hits monitor | map | `{}` | no |
|
||||||
| get_hits_threshold_critical | Elasticache memcached get hits critical threshold in percentage | string | `60` | no |
|
| get_hits_threshold_critical | Elasticache memcached get hits critical threshold in percentage | string | `60` | no |
|
||||||
| get_hits_threshold_warning | Elasticache memcached get hits warning threshold in percentage | string | `80` | no |
|
| get_hits_threshold_warning | Elasticache memcached get hits warning threshold in percentage | string | `80` | no |
|
||||||
|
| get_hits_time_aggregator | Monitor aggregator for Elasticache memcached get hits [available values: min, max or avg] | string | `max` | no |
|
||||||
| get_hits_timeframe | Monitor timeframe for Elasticache memcached get hits [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
|
| get_hits_timeframe | Monitor timeframe for Elasticache memcached get hits [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
|
||||||
| message | Message sent when an alert is triggered | string | - | yes |
|
| message | Message sent when an alert is triggered | string | - | yes |
|
||||||
| new_host_delay | Delay in seconds before monitor new resource | string | `300` | no |
|
| new_host_delay | Delay in seconds before monitor new resource | string | `300` | no |
|
||||||
|
|||||||
@ -59,6 +59,12 @@ variable "get_hits_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "get_hits_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for Elasticache memcached get hits [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "max"
|
||||||
|
}
|
||||||
|
|
||||||
variable "get_hits_timeframe" {
|
variable "get_hits_timeframe" {
|
||||||
description = "Monitor timeframe for Elasticache memcached get hits [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for Elasticache memcached get hits [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
|
|||||||
@ -6,10 +6,10 @@ resource "datadog_monitor" "memcached_get_hits" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.get_hits_timeframe}): (
|
${var.get_hits_time_aggregator}(${var.get_hits_timeframe}): (
|
||||||
avg:aws.elasticache.get_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count() /
|
default(avg:aws.elasticache.get_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0) / (
|
||||||
(avg:aws.elasticache.get_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count() +
|
default(avg:aws.elasticache.get_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0) +
|
||||||
avg:aws.elasticache.get_misses${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count())
|
default(avg:aws.elasticache.get_misses${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0))
|
||||||
) * 100 < ${var.get_hits_threshold_critical}
|
) * 100 < ${var.get_hits_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
|||||||
@ -31,6 +31,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| cache_hits_silenced | Groups to mute for Elasticache redis cache hits monitor | map | `{}` | no |
|
| cache_hits_silenced | Groups to mute for Elasticache redis cache hits monitor | map | `{}` | no |
|
||||||
| cache_hits_threshold_critical | Elasticache redis cache hits critical threshold in percentage | string | `60` | no |
|
| cache_hits_threshold_critical | Elasticache redis cache hits critical threshold in percentage | string | `60` | no |
|
||||||
| cache_hits_threshold_warning | Elasticache redis cache hits warning threshold in percentage | string | `80` | no |
|
| cache_hits_threshold_warning | Elasticache redis cache hits warning threshold in percentage | string | `80` | no |
|
||||||
|
| cache_hits_time_aggregator | Monitor aggregator for Elasticache redis cache hits [available values: min, max or avg] | string | `max` | no |
|
||||||
| cache_hits_timeframe | Monitor timeframe for Elasticache redis cache hits [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
|
| cache_hits_timeframe | Monitor timeframe for Elasticache redis cache hits [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
|
||||||
| commands_enabled | Flag to enable Elasticache redis commands monitor | string | `true` | no |
|
| commands_enabled | Flag to enable Elasticache redis commands monitor | string | `true` | no |
|
||||||
| commands_extra_tags | Extra tags for Elasticache redis commands monitor | list | `[]` | no |
|
| commands_extra_tags | Extra tags for Elasticache redis commands monitor | list | `[]` | no |
|
||||||
@ -43,7 +44,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| cpu_high_silenced | Groups to mute for Elasticache redis cpu high monitor | map | `{}` | no |
|
| cpu_high_silenced | Groups to mute for Elasticache redis cpu high monitor | map | `{}` | no |
|
||||||
| cpu_high_threshold_critical | Elasticache redis cpu high critical threshold in percentage | string | `90` | no |
|
| cpu_high_threshold_critical | Elasticache redis cpu high critical threshold in percentage | string | `90` | no |
|
||||||
| cpu_high_threshold_warning | Elasticache redis cpu high warning threshold in percentage | string | `75` | no |
|
| cpu_high_threshold_warning | Elasticache redis cpu high warning threshold in percentage | string | `75` | no |
|
||||||
| cpu_high_time_aggregator | Monitor aggregator for Elasticache redis cpu high [available values: min, max or avg] | string | `avg` | no |
|
| cpu_high_time_aggregator | Monitor aggregator for Elasticache redis cpu high [available values: min, max or avg] | string | `min` | no |
|
||||||
| cpu_high_timeframe | Monitor timeframe for Elasticache redis cpu high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
|
| cpu_high_timeframe | Monitor timeframe for Elasticache redis cpu high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
|
||||||
| environment | Infrastructure Environment | string | - | yes |
|
| environment | Infrastructure Environment | string | - | yes |
|
||||||
| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||||
|
|||||||
@ -59,6 +59,12 @@ variable "cache_hits_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "cache_hits_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for Elasticache redis cache hits [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "max"
|
||||||
|
}
|
||||||
|
|
||||||
variable "cache_hits_timeframe" {
|
variable "cache_hits_timeframe" {
|
||||||
description = "Monitor timeframe for Elasticache redis cache hits [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for Elasticache redis cache hits [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -104,7 +110,7 @@ variable "cpu_high_message" {
|
|||||||
variable "cpu_high_time_aggregator" {
|
variable "cpu_high_time_aggregator" {
|
||||||
description = "Monitor aggregator for Elasticache redis cpu high [available values: min, max or avg]"
|
description = "Monitor aggregator for Elasticache redis cpu high [available values: min, max or avg]"
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "avg"
|
default = "min"
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "cpu_high_timeframe" {
|
variable "cpu_high_timeframe" {
|
||||||
|
|||||||
@ -6,10 +6,10 @@ resource "datadog_monitor" "redis_cache_hits" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.cache_hits_timeframe}): (
|
${var.cache_hits_time_aggregator}(${var.cache_hits_timeframe}): (
|
||||||
avg:aws.elasticache.cache_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count() /
|
default(avg:aws.elasticache.cache_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0) / (
|
||||||
(avg:aws.elasticache.cache_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count() +
|
default(avg:aws.elasticache.cache_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0) +
|
||||||
avg:aws.elasticache.cache_misses${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count())
|
default(avg:aws.elasticache.cache_misses${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0))
|
||||||
) * 100 < ${var.cache_hits_threshold_critical}
|
) * 100 < ${var.cache_hits_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
|||||||
@ -40,8 +40,8 @@ resource "datadog_monitor" "ELB_too_much_4xx" {
|
|||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.elb_4xx_timeframe}):
|
sum(${var.elb_4xx_timeframe}):
|
||||||
default(avg:aws.elb.httpcode_elb_4xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_count(), 0) / (
|
default(avg:aws.elb.httpcode_elb_4xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
|
||||||
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_count(), 0) + ${var.artificial_requests_count})
|
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.elb_4xx_threshold_critical}
|
* 100 > ${var.elb_4xx_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -74,8 +74,8 @@ resource "datadog_monitor" "ELB_too_much_5xx" {
|
|||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.elb_5xx_timeframe}):
|
sum(${var.elb_5xx_timeframe}):
|
||||||
default(avg:aws.elb.httpcode_elb_5xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_count(), 0) / (
|
default(avg:aws.elb.httpcode_elb_5xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
|
||||||
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_count(), 0) + ${var.artificial_requests_count})
|
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.elb_5xx_threshold_critical}
|
* 100 > ${var.elb_5xx_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -108,8 +108,8 @@ resource "datadog_monitor" "ELB_too_much_4xx_backend" {
|
|||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.elb_backend_4xx_timeframe}):
|
sum(${var.elb_backend_4xx_timeframe}):
|
||||||
default(avg:aws.elb.httpcode_backend_4xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_count(), 0) / (
|
default(avg:aws.elb.httpcode_backend_4xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
|
||||||
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_count(), 0) + ${var.artificial_requests_count})
|
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.elb_backend_4xx_threshold_critical}
|
* 100 > ${var.elb_backend_4xx_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -142,8 +142,8 @@ resource "datadog_monitor" "ELB_too_much_5xx_backend" {
|
|||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.elb_backend_5xx_timeframe}):
|
sum(${var.elb_backend_5xx_timeframe}):
|
||||||
default(avg:aws.elb.httpcode_backend_5xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_count(), 0) / (
|
default(avg:aws.elb.httpcode_backend_5xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
|
||||||
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_count(), 0) + ${var.artificial_requests_count})
|
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.elb_backend_5xx_threshold_critical}
|
* 100 > ${var.elb_backend_5xx_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
|||||||
@ -34,6 +34,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| failed_requests_silenced | Groups to mute for API Management failed requests monitor | map | `{}` | no |
|
| failed_requests_silenced | Groups to mute for API Management failed requests monitor | map | `{}` | no |
|
||||||
| failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `90` | no |
|
| failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `90` | no |
|
||||||
| failed_requests_threshold_warning | Warning regarding acceptable percent of failed requests | string | `50` | no |
|
| failed_requests_threshold_warning | Warning regarding acceptable percent of failed requests | string | `50` | no |
|
||||||
|
| failed_requests_time_aggregator | Monitor aggregator for API Management failed requests [available values: min, max or avg] | string | `min` | no |
|
||||||
| failed_requests_timeframe | Monitor timeframe for API Management failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| failed_requests_timeframe | Monitor timeframe for API Management failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||||
| filter_tags_custom_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `` | no |
|
| filter_tags_custom_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `` | no |
|
||||||
@ -46,6 +47,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| other_requests_silenced | Groups to mute for API Management other requests monitor | map | `{}` | no |
|
| other_requests_silenced | Groups to mute for API Management other requests monitor | map | `{}` | no |
|
||||||
| other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `90` | no |
|
| other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `90` | no |
|
||||||
| other_requests_threshold_warning | Warning regarding acceptable percent of other requests | string | `50` | no |
|
| other_requests_threshold_warning | Warning regarding acceptable percent of other requests | string | `50` | no |
|
||||||
|
| other_requests_time_aggregator | Monitor aggregator for API Management other requests [available values: min, max or avg] | string | `min` | no |
|
||||||
| other_requests_timeframe | Monitor timeframe for API Management other requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| other_requests_timeframe | Monitor timeframe for API Management other requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| status_enabled | Flag to enable API Management status monitor | string | `true` | no |
|
| status_enabled | Flag to enable API Management status monitor | string | `true` | no |
|
||||||
| status_extra_tags | Extra tags for API Management status monitor | list | `[]` | no |
|
| status_extra_tags | Extra tags for API Management status monitor | list | `[]` | no |
|
||||||
@ -59,6 +61,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| successful_requests_silenced | Groups to mute for API Management successful requests monitor | map | `{}` | no |
|
| successful_requests_silenced | Groups to mute for API Management successful requests monitor | map | `{}` | no |
|
||||||
| successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `10` | no |
|
| successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `10` | no |
|
||||||
| successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests | string | `30` | no |
|
| successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests | string | `30` | no |
|
||||||
|
| successful_requests_time_aggregator | Monitor aggregator for API Management successful requests [available values: min, max or avg] | string | `max` | no |
|
||||||
| successful_requests_timeframe | Monitor timeframe for API Management successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| successful_requests_timeframe | Monitor timeframe for API Management successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| unauthorized_requests_enabled | Flag to enable API Management unauthorized requests monitor | string | `true` | no |
|
| unauthorized_requests_enabled | Flag to enable API Management unauthorized requests monitor | string | `true` | no |
|
||||||
| unauthorized_requests_extra_tags | Extra tags for API Management unauthorized requests monitor | list | `[]` | no |
|
| unauthorized_requests_extra_tags | Extra tags for API Management unauthorized requests monitor | list | `[]` | no |
|
||||||
@ -66,6 +69,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| unauthorized_requests_silenced | Groups to mute for API Management unauthorized requests monitor | map | `{}` | no |
|
| unauthorized_requests_silenced | Groups to mute for API Management unauthorized requests monitor | map | `{}` | no |
|
||||||
| unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `90` | no |
|
| unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `90` | no |
|
||||||
| unauthorized_requests_threshold_warning | Warning regarding acceptable percent of unauthorized requests | string | `50` | no |
|
| unauthorized_requests_threshold_warning | Warning regarding acceptable percent of unauthorized requests | string | `50` | no |
|
||||||
|
| unauthorized_requests_time_aggregator | Monitor aggregator for API Management unauthorized requests [available values: min, max or avg] | string | `min` | no |
|
||||||
| unauthorized_requests_timeframe | Monitor timeframe for API Management unauthorized requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| unauthorized_requests_timeframe | Monitor timeframe for API Management unauthorized requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
|
|
||||||
## Outputs
|
## Outputs
|
||||||
|
|||||||
@ -95,6 +95,12 @@ variable "failed_requests_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "failed_requests_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for API Management failed requests [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "failed_requests_timeframe" {
|
variable "failed_requests_timeframe" {
|
||||||
description = "Monitor timeframe for API Management failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for API Management failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -135,6 +141,12 @@ variable "other_requests_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "other_requests_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for API Management other requests [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "other_requests_timeframe" {
|
variable "other_requests_timeframe" {
|
||||||
description = "Monitor timeframe for API Management other requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for API Management other requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -175,6 +187,12 @@ variable "unauthorized_requests_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "unauthorized_requests_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for API Management unauthorized requests [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "unauthorized_requests_timeframe" {
|
variable "unauthorized_requests_timeframe" {
|
||||||
description = "Monitor timeframe for API Management unauthorized requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for API Management unauthorized requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -215,6 +233,12 @@ variable "successful_requests_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "successful_requests_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for API Management successful requests [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "max"
|
||||||
|
}
|
||||||
|
|
||||||
variable "successful_requests_timeframe" {
|
variable "successful_requests_timeframe" {
|
||||||
description = "Monitor timeframe for API Management successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for API Management successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
|
|||||||
@ -34,10 +34,10 @@ resource "datadog_monitor" "apimgt_failed_requests" {
|
|||||||
message = "${coalesce(var.failed_requests_message, var.message)}"
|
message = "${coalesce(var.failed_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.failed_requests_timeframe}): (
|
${var.failed_requests_time_aggregator}(${var.failed_requests_timeframe}): (
|
||||||
avg:azure.apimanagement_service.failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() /
|
default(avg:azure.apimanagement_service.failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
||||||
avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() * 100
|
default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
) > ${var.failed_requests_threshold_critical}
|
) * 100 > ${var.failed_requests_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
@ -67,10 +67,10 @@ resource "datadog_monitor" "apimgt_other_requests" {
|
|||||||
message = "${coalesce(var.other_requests_message, var.message)}"
|
message = "${coalesce(var.other_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.other_requests_timeframe}): (
|
${var.other_requests_time_aggregator}(${var.other_requests_timeframe}): (
|
||||||
avg:azure.apimanagement_service.other_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() /
|
default(avg:azure.apimanagement_service.other_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
||||||
avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() * 100
|
default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
) > ${var.other_requests_threshold_critical}
|
) * 100 > ${var.other_requests_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
@ -100,10 +100,10 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" {
|
|||||||
message = "${coalesce(var.unauthorized_requests_message, var.message)}"
|
message = "${coalesce(var.unauthorized_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.unauthorized_requests_timeframe}): (
|
${var.unauthorized_requests_time_aggregator}(${var.unauthorized_requests_timeframe}): (
|
||||||
avg:azure.apimanagement_service.unauthorized_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() /
|
default(avg:azure.apimanagement_service.unauthorized_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
||||||
avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() * 100
|
default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
) > ${var.unauthorized_requests_threshold_critical}
|
) * 100 > ${var.unauthorized_requests_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
@ -133,10 +133,10 @@ resource "datadog_monitor" "apimgt_successful_requests" {
|
|||||||
message = "${coalesce(var.successful_requests_message, var.message)}"
|
message = "${coalesce(var.successful_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.successful_requests_timeframe}): (
|
${var.successful_requests_time_aggregator}(${var.successful_requests_timeframe}): (
|
||||||
avg:azure.apimanagement_service.successful_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() /
|
default(avg:azure.apimanagement_service.successful_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1) /
|
||||||
avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() * 100
|
default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
) < ${var.successful_requests_threshold_critical}
|
) * 100 < ${var.successful_requests_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
|
|||||||
@ -37,6 +37,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| http_4xx_requests_silenced | Groups to mute for App Services 4xx requests monitor | map | `{}` | no |
|
| http_4xx_requests_silenced | Groups to mute for App Services 4xx requests monitor | map | `{}` | no |
|
||||||
| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `90` | no |
|
| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `90` | no |
|
||||||
| http_4xx_requests_threshold_warning | Warning regarding acceptable percent of 4xx errors | string | `50` | no |
|
| http_4xx_requests_threshold_warning | Warning regarding acceptable percent of 4xx errors | string | `50` | no |
|
||||||
|
| http_4xx_requests_time_aggregator | Monitor aggregator for App Services 4xx requests [available values: min, max or avg] | string | `min` | no |
|
||||||
| http_4xx_requests_timeframe | Monitor timeframe for App Services 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| http_4xx_requests_timeframe | Monitor timeframe for App Services 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| http_5xx_requests_enabled | Flag to enable App Services 5xx requests monitor | string | `true` | no |
|
| http_5xx_requests_enabled | Flag to enable App Services 5xx requests monitor | string | `true` | no |
|
||||||
| http_5xx_requests_extra_tags | Extra tags for App Services 5xx requests monitor | list | `[]` | no |
|
| http_5xx_requests_extra_tags | Extra tags for App Services 5xx requests monitor | list | `[]` | no |
|
||||||
@ -44,6 +45,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| http_5xx_requests_silenced | Groups to mute for App Services 5xx requests monitor | map | `{}` | no |
|
| http_5xx_requests_silenced | Groups to mute for App Services 5xx requests monitor | map | `{}` | no |
|
||||||
| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `90` | no |
|
| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `90` | no |
|
||||||
| http_5xx_requests_threshold_warning | Warning regarding acceptable percent of 5xx errors | string | `50` | no |
|
| http_5xx_requests_threshold_warning | Warning regarding acceptable percent of 5xx errors | string | `50` | no |
|
||||||
|
| http_5xx_requests_time_aggregator | Monitor aggregator for App Services 5xx requests [available values: min, max or avg] | string | `min` | no |
|
||||||
| http_5xx_requests_timeframe | Monitor timeframe for App Services 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| http_5xx_requests_timeframe | Monitor timeframe for App Services 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| http_successful_requests_enabled | Flag to enable App Services successful requests monitor | string | `true` | no |
|
| http_successful_requests_enabled | Flag to enable App Services successful requests monitor | string | `true` | no |
|
||||||
| http_successful_requests_extra_tags | Extra tags for App Services successful requests monitor | list | `[]` | no |
|
| http_successful_requests_extra_tags | Extra tags for App Services successful requests monitor | list | `[]` | no |
|
||||||
@ -51,6 +53,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| http_successful_requests_silenced | Groups to mute for App Services successful requests monitor | map | `{}` | no |
|
| http_successful_requests_silenced | Groups to mute for App Services successful requests monitor | map | `{}` | no |
|
||||||
| http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `10` | no |
|
| http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `10` | no |
|
||||||
| http_successful_requests_threshold_warning | Warning regarding acceptable percent of 2xx & 3xx requests | string | `30` | no |
|
| http_successful_requests_threshold_warning | Warning regarding acceptable percent of 2xx & 3xx requests | string | `30` | no |
|
||||||
|
| http_successful_requests_time_aggregator | Monitor aggregator for App Services successful requests [available values: min, max or avg] | string | `max` | no |
|
||||||
| http_successful_requests_timeframe | Monitor timeframe for App Services successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| http_successful_requests_timeframe | Monitor timeframe for App Services successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| memory_usage_enabled | Flag to enable App Services memory usage monitor | string | `true` | no |
|
| memory_usage_enabled | Flag to enable App Services memory usage monitor | string | `true` | no |
|
||||||
| memory_usage_extra_tags | Extra tags for App Services memory usage monitor | list | `[]` | no |
|
| memory_usage_extra_tags | Extra tags for App Services memory usage monitor | list | `[]` | no |
|
||||||
|
|||||||
@ -149,6 +149,12 @@ variable "http_4xx_requests_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "http_4xx_requests_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for App Services 4xx requests [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "http_4xx_requests_timeframe" {
|
variable "http_4xx_requests_timeframe" {
|
||||||
description = "Monitor timeframe for App Services 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for App Services 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -189,6 +195,12 @@ variable "http_5xx_requests_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "http_5xx_requests_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for App Services 5xx requests [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "http_5xx_requests_timeframe" {
|
variable "http_5xx_requests_timeframe" {
|
||||||
description = "Monitor timeframe for App Services 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for App Services 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -229,6 +241,12 @@ variable "http_successful_requests_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "http_successful_requests_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for App Services successful requests [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "max"
|
||||||
|
}
|
||||||
|
|
||||||
variable "http_successful_requests_timeframe" {
|
variable "http_successful_requests_timeframe" {
|
||||||
description = "Monitor timeframe for App Services successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for App Services successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
|
|||||||
@ -70,9 +70,9 @@ resource "datadog_monitor" "appservices_http_5xx_errors_count" {
|
|||||||
message = "${coalesce(var.http_5xx_requests_message, var.message)}"
|
message = "${coalesce(var.http_5xx_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.http_5xx_requests_timeframe}): (
|
${var.http_5xx_requests_time_aggregator}(${var.http_5xx_requests_timeframe}): (
|
||||||
avg:azure.app_services.http5xx${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() /
|
default(avg:azure.app_services.http5xx${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
||||||
avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count()
|
default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
) * 100 > ${var.http_5xx_requests_threshold_critical}
|
) * 100 > ${var.http_5xx_requests_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -103,9 +103,9 @@ resource "datadog_monitor" "appservices_http_4xx_errors_count" {
|
|||||||
message = "${coalesce(var.http_4xx_requests_message, var.message)}"
|
message = "${coalesce(var.http_4xx_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.http_4xx_requests_timeframe}): (
|
${var.http_4xx_requests_time_aggregator}(${var.http_4xx_requests_timeframe}): (
|
||||||
avg:azure.app_services.http4xx${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() /
|
default(avg:azure.app_services.http4xx${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
||||||
avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count()
|
default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
) * 100 > ${var.http_4xx_requests_threshold_critical}
|
) * 100 > ${var.http_4xx_requests_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -136,10 +136,10 @@ resource "datadog_monitor" "appservices_http_success_status_rate" {
|
|||||||
message = "${coalesce(var.http_successful_requests_message, var.message)}"
|
message = "${coalesce(var.http_successful_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.http_successful_requests_timeframe}): (
|
${var.http_successful_requests_time_aggregator}(${var.http_successful_requests_timeframe}): ( (
|
||||||
(avg:azure.app_services.http2xx${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() +
|
default(avg:azure.app_services.http2xx${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.app_services.http3xx${module.filter-tags.query_alert} by {resource_group,region,name}.as_count()) /
|
default(avg:azure.app_services.http3xx${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) ) /
|
||||||
avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count()
|
default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
) * 100 < ${var.http_successful_requests_threshold_critical}
|
) * 100 < ${var.http_successful_requests_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
|||||||
@ -28,7 +28,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| cosmos_db_4xx_request_extra_tags | Extra tags for Cosmos DB 4xx requests monitor | list | `[]` | no |
|
| cosmos_db_4xx_request_extra_tags | Extra tags for Cosmos DB 4xx requests monitor | list | `[]` | no |
|
||||||
| cosmos_db_4xx_request_rate_threshold_critical | Critical threshold for Cosmos DB 4xx requests monitor | string | `80` | no |
|
| cosmos_db_4xx_request_rate_threshold_critical | Critical threshold for Cosmos DB 4xx requests monitor | string | `80` | no |
|
||||||
| cosmos_db_4xx_request_rate_threshold_warning | Warning threshold for Cosmos DB 4xx requests monitor | string | `50` | no |
|
| cosmos_db_4xx_request_rate_threshold_warning | Warning threshold for Cosmos DB 4xx requests monitor | string | `50` | no |
|
||||||
| cosmos_db_4xx_request_time_aggregator | Monitor aggregator for Cosmos DB 4xx requests [available values: min, max or avg] | string | `sum` | no |
|
| cosmos_db_4xx_request_time_aggregator | Monitor aggregator for Cosmos DB 4xx requests [available values: min, max or avg] | string | `min` | no |
|
||||||
| cosmos_db_4xx_request_timeframe | Monitor timeframe for Cosmos DB 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| cosmos_db_4xx_request_timeframe | Monitor timeframe for Cosmos DB 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| cosmos_db_4xx_requests_enabled | Flag to enable Cosmos DB 4xx requests monitor | string | `true` | no |
|
| cosmos_db_4xx_requests_enabled | Flag to enable Cosmos DB 4xx requests monitor | string | `true` | no |
|
||||||
| cosmos_db_4xx_requests_message | Custom message for Cosmos DB 4xx requests monitor | string | `` | no |
|
| cosmos_db_4xx_requests_message | Custom message for Cosmos DB 4xx requests monitor | string | `` | no |
|
||||||
@ -36,7 +36,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| cosmos_db_5xx_request_rate_extra_tags | Extra tags for Cosmos DB 5xx requests monitor | list | `[]` | no |
|
| cosmos_db_5xx_request_rate_extra_tags | Extra tags for Cosmos DB 5xx requests monitor | list | `[]` | no |
|
||||||
| cosmos_db_5xx_request_rate_threshold_critical | Critical threshold for Cosmos DB 5xx requests monitor | string | `80` | no |
|
| cosmos_db_5xx_request_rate_threshold_critical | Critical threshold for Cosmos DB 5xx requests monitor | string | `80` | no |
|
||||||
| cosmos_db_5xx_request_rate_threshold_warning | Warning threshold for Cosmos DB 5xx requests monitor | string | `50` | no |
|
| cosmos_db_5xx_request_rate_threshold_warning | Warning threshold for Cosmos DB 5xx requests monitor | string | `50` | no |
|
||||||
| cosmos_db_5xx_request_time_aggregator | Monitor aggregator for Cosmos DB 5xx requests [available values: min, max or avg] | string | `sum` | no |
|
| cosmos_db_5xx_request_time_aggregator | Monitor aggregator for Cosmos DB 5xx requests [available values: min, max or avg] | string | `min` | no |
|
||||||
| cosmos_db_5xx_request_timeframe | Monitor timeframe for Cosmos DB 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| cosmos_db_5xx_request_timeframe | Monitor timeframe for Cosmos DB 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| cosmos_db_5xx_requests_enabled | Flag to enable Cosmos DB 5xx requests monitor | string | `true` | no |
|
| cosmos_db_5xx_requests_enabled | Flag to enable Cosmos DB 5xx requests monitor | string | `true` | no |
|
||||||
| cosmos_db_5xx_requests_message | Custom message for Cosmos DB 5xx requests monitor | string | `` | no |
|
| cosmos_db_5xx_requests_message | Custom message for Cosmos DB 5xx requests monitor | string | `` | no |
|
||||||
@ -47,7 +47,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| cosmos_db_scaling_extra_tags | Extra tags for Cosmos DB scaling monitor | list | `[]` | no |
|
| cosmos_db_scaling_extra_tags | Extra tags for Cosmos DB scaling monitor | list | `[]` | no |
|
||||||
| cosmos_db_scaling_message | Custom message for Cosmos DB scaling monitor | string | `` | no |
|
| cosmos_db_scaling_message | Custom message for Cosmos DB scaling monitor | string | `` | no |
|
||||||
| cosmos_db_scaling_silenced | Groups to mute for Cosmos DB scaling monitor | map | `{}` | no |
|
| cosmos_db_scaling_silenced | Groups to mute for Cosmos DB scaling monitor | map | `{}` | no |
|
||||||
| cosmos_db_scaling_time_aggregator | Monitor aggregator for Cosmos DB scaling [available values: min, max or avg] | string | `sum` | no |
|
| cosmos_db_scaling_time_aggregator | Monitor aggregator for Cosmos DB scaling [available values: min, max or avg] | string | `min` | no |
|
||||||
| cosmos_db_scaling_timeframe | Monitor timeframe for Cosmos DB scaling [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| cosmos_db_scaling_timeframe | Monitor timeframe for Cosmos DB scaling [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| environment | Architecture environment | string | - | yes |
|
| environment | Architecture environment | string | - | yes |
|
||||||
| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||||
|
|||||||
@ -106,7 +106,7 @@ variable "cosmos_db_4xx_request_extra_tags" {
|
|||||||
variable "cosmos_db_4xx_request_time_aggregator" {
|
variable "cosmos_db_4xx_request_time_aggregator" {
|
||||||
description = "Monitor aggregator for Cosmos DB 4xx requests [available values: min, max or avg]"
|
description = "Monitor aggregator for Cosmos DB 4xx requests [available values: min, max or avg]"
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "sum"
|
default = "min"
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "cosmos_db_4xx_request_timeframe" {
|
variable "cosmos_db_4xx_request_timeframe" {
|
||||||
@ -152,7 +152,7 @@ variable "cosmos_db_5xx_request_rate_extra_tags" {
|
|||||||
variable "cosmos_db_5xx_request_time_aggregator" {
|
variable "cosmos_db_5xx_request_time_aggregator" {
|
||||||
description = "Monitor aggregator for Cosmos DB 5xx requests [available values: min, max or avg]"
|
description = "Monitor aggregator for Cosmos DB 5xx requests [available values: min, max or avg]"
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "sum"
|
default = "min"
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "cosmos_db_5xx_request_timeframe" {
|
variable "cosmos_db_5xx_request_timeframe" {
|
||||||
@ -198,7 +198,7 @@ variable "cosmos_db_scaling_extra_tags" {
|
|||||||
variable "cosmos_db_scaling_time_aggregator" {
|
variable "cosmos_db_scaling_time_aggregator" {
|
||||||
description = "Monitor aggregator for Cosmos DB scaling [available values: min, max or avg]"
|
description = "Monitor aggregator for Cosmos DB scaling [available values: min, max or avg]"
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "sum"
|
default = "min"
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "cosmos_db_scaling_timeframe" {
|
variable "cosmos_db_scaling_timeframe" {
|
||||||
|
|||||||
@ -40,35 +40,31 @@ resource "datadog_monitor" "cosmos_db_4xx_requests" {
|
|||||||
|
|
||||||
# List of available status codes : https://docs.microsoft.com/en-us/rest/api/cosmos-db/http-status-codes-for-cosmosdb
|
# List of available status codes : https://docs.microsoft.com/en-us/rest/api/cosmos-db/http-status-codes-for-cosmosdb
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
${var.cosmos_db_4xx_request_time_aggregator}(${var.cosmos_db_4xx_request_timeframe}): (default(
|
${var.cosmos_db_4xx_request_time_aggregator}(${var.cosmos_db_4xx_request_timeframe}): ( (
|
||||||
(
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "400")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "400")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "401")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "401")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "403")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "403")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "404")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "404")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "408")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "408")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "409")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "409")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "412")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "412")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "413")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "413")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "449")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "449")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "400")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "400")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "401")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "401")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "403")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "403")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "404")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "404")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "408")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "408")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "409")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "409")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "412")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "412")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "413")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "413")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "449")} by {resource_group,region,name,collectionname}.as_rate(), 0) ) / (
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "449")} by {resource_group,region,name,collectionname}.as_count()
|
default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
) / (
|
default(sum:azure.documentdb_databaseaccounts.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 0) )
|
||||||
sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_count() +
|
) * 100 > ${var.cosmos_db_4xx_request_rate_threshold_critical}
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_count()
|
EOF
|
||||||
)
|
|
||||||
* 100, 0)
|
|
||||||
) > ${var.cosmos_db_4xx_request_rate_threshold_critical}
|
|
||||||
EOF
|
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -99,19 +95,15 @@ resource "datadog_monitor" "cosmos_db_5xx_requests" {
|
|||||||
message = "${coalesce(var.cosmos_db_5xx_requests_message, var.message)}"
|
message = "${coalesce(var.cosmos_db_5xx_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
${var.cosmos_db_5xx_request_time_aggregator}(${var.cosmos_db_5xx_request_timeframe}): (default(
|
${var.cosmos_db_5xx_request_time_aggregator}(${var.cosmos_db_5xx_request_timeframe}): ( (
|
||||||
(
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "500")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "500")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "503")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "503")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "500")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "500")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "503")} by {resource_group,region,name,collectionname}.as_rate(), 0) ) / (
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "503")} by {resource_group,region,name,collectionname}.as_count()
|
default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
) / (
|
default(sum:azure.documentdb_databaseaccounts.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 0) )
|
||||||
sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_count() +
|
) * 100 > ${var.cosmos_db_5xx_request_rate_threshold_critical}
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_count()
|
EOF
|
||||||
)
|
|
||||||
* 100, 0)
|
|
||||||
) > ${var.cosmos_db_5xx_request_rate_threshold_critical}
|
|
||||||
EOF
|
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -143,17 +135,13 @@ resource "datadog_monitor" "cosmos_db_scaling" {
|
|||||||
|
|
||||||
# List of available status codes : https://docs.microsoft.com/en-us/rest/api/cosmos-db/http-status-codes-for-cosmosdb
|
# List of available status codes : https://docs.microsoft.com/en-us/rest/api/cosmos-db/http-status-codes-for-cosmosdb
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
${var.cosmos_db_scaling_time_aggregator}(${var.cosmos_db_scaling_timeframe}): (default(
|
${var.cosmos_db_scaling_time_aggregator}(${var.cosmos_db_scaling_timeframe}): ( (
|
||||||
(
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,collectionname}.as_count() +
|
default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,collectionname}.as_rate(), 0) ) / (
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,collectionname}.as_count()
|
default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
) / (
|
default(sum:azure.documentdb_databaseaccounts.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 0) )
|
||||||
sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_count() +
|
) * 100 > ${var.cosmos_db_scaling_error_rate_threshold_critical}
|
||||||
sum:azure.documentdb_databaseaccounts.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_count()
|
EOF
|
||||||
)
|
|
||||||
* 100, 0)
|
|
||||||
) > ${var.cosmos_db_scaling_error_rate_threshold_critical}
|
|
||||||
EOF
|
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -31,6 +31,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| errors_rate_silenced | Groups to mute for Event Hub errors monitor | map | `{}` | no |
|
| errors_rate_silenced | Groups to mute for Event Hub errors monitor | map | `{}` | no |
|
||||||
| errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `90` | no |
|
| errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `90` | no |
|
||||||
| errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `50` | no |
|
| errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `50` | no |
|
||||||
|
| errors_rate_time_aggregator | Monitor aggregator for Event Hub errors [available values: min, max or avg] | string | `min` | no |
|
||||||
| errors_rate_timeframe | Monitor timeframe for Event Hub errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| errors_rate_timeframe | Monitor timeframe for Event Hub errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||||
| failed_requests_rate_enabled | Flag to enable Event Hub failed requests monitor | string | `true` | no |
|
| failed_requests_rate_enabled | Flag to enable Event Hub failed requests monitor | string | `true` | no |
|
||||||
@ -39,6 +40,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| failed_requests_rate_silenced | Groups to mute for Event Hub failed requests monitor | map | `{}` | no |
|
| failed_requests_rate_silenced | Groups to mute for Event Hub failed requests monitor | map | `{}` | no |
|
||||||
| failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `90` | no |
|
| failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `90` | no |
|
||||||
| failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `50` | no |
|
| failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `50` | no |
|
||||||
|
| failed_requests_rate_time_aggregator | Monitor aggregator for Event Hub failed requests [available values: min, max or avg] | string | `min` | no |
|
||||||
| failed_requests_rate_timeframe | Monitor timeframe for Event Hub failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| failed_requests_rate_timeframe | Monitor timeframe for Event Hub failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||||
| filter_tags_custom_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `` | no |
|
| filter_tags_custom_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `` | no |
|
||||||
|
|||||||
@ -95,6 +95,12 @@ variable "failed_requests_rate_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "failed_requests_rate_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for Event Hub failed requests [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "failed_requests_rate_timeframe" {
|
variable "failed_requests_rate_timeframe" {
|
||||||
description = "Monitor timeframe for Event Hub failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for Event Hub failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -135,6 +141,12 @@ variable "errors_rate_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "errors_rate_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for Event Hub errors [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "errors_rate_timeframe" {
|
variable "errors_rate_timeframe" {
|
||||||
description = "Monitor timeframe for Event Hub errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for Event Hub errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
|
|||||||
@ -32,12 +32,10 @@ resource "datadog_monitor" "eventhub_failed_requests" {
|
|||||||
message = "${coalesce(var.failed_requests_rate_message, var.message)}"
|
message = "${coalesce(var.failed_requests_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.failed_requests_rate_timeframe}): (
|
${var.failed_requests_rate_time_aggregator}(${var.failed_requests_rate_timeframe}): (
|
||||||
default(
|
default(avg:azure.eventhub_namespaces.failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
||||||
avg:azure.eventhub_namespaces.failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() /
|
default(avg:azure.eventhub_namespaces.incoming_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
avg:azure.eventhub_namespaces.incoming_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count(),
|
) * 100 > ${var.failed_requests_rate_thresold_critical}
|
||||||
0) * 100
|
|
||||||
) > ${var.failed_requests_rate_thresold_critical}
|
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
@ -68,17 +66,12 @@ resource "datadog_monitor" "eventhub_errors" {
|
|||||||
message = "${coalesce(var.errors_rate_message, var.message)}"
|
message = "${coalesce(var.errors_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.errors_rate_timeframe}): (
|
${var.errors_rate_time_aggregator}(${var.errors_rate_timeframe}): ( (
|
||||||
default(
|
default(avg:azure.eventhub_namespaces.internal_server_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
(
|
default(avg:azure.eventhub_namespaces.server_busy_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.eventhub_namespaces.internal_server_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() +
|
default(avg:azure.eventhub_namespaces.other_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) ) /
|
||||||
avg:azure.eventhub_namespaces.server_busy_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() +
|
default(avg:azure.eventhub_namespaces.incoming_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
avg:azure.eventhub_namespaces.other_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_count()
|
) * 100 > ${var.errors_rate_thresold_critical}
|
||||||
) / (
|
|
||||||
avg:eventhub_namespaces.incoming_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count()
|
|
||||||
),
|
|
||||||
0) * 100
|
|
||||||
) > ${var.errors_rate_thresold_critical}
|
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|||||||
@ -41,6 +41,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| dropped_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `90` | no |
|
| dropped_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `90` | no |
|
||||||
| dropped_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `50` | no |
|
| dropped_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `50` | no |
|
||||||
| dropped_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub dropped d2c telemetry monitor | map | `{}` | no |
|
| dropped_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub dropped d2c telemetry monitor | map | `{}` | no |
|
||||||
|
| dropped_d2c_telemetry_egress_time_aggregator | Monitor aggregator for IoT Hub dropped d2c telemetry [available values: min, max, sum or avg] | string | `min` | no |
|
||||||
| dropped_d2c_telemetry_egress_timeframe | Monitor timeframe for IoT Hub dropped d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| dropped_d2c_telemetry_egress_timeframe | Monitor timeframe for IoT Hub dropped d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| environment | Architecture Environment | string | - | yes |
|
| environment | Architecture Environment | string | - | yes |
|
||||||
| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||||
@ -50,6 +51,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| failed_c2d_methods_rate_silenced | Groups to mute for IoT Hub failed c2d methods monitor | map | `{}` | no |
|
| failed_c2d_methods_rate_silenced | Groups to mute for IoT Hub failed c2d methods monitor | map | `{}` | no |
|
||||||
| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `90` | no |
|
| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `90` | no |
|
||||||
| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `50` | no |
|
| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `50` | no |
|
||||||
|
| failed_c2d_methods_rate_time_aggregator | Monitor aggregator for IoT Hub failed c2d method [available values: min, max, sum or avg] | string | `min` | no |
|
||||||
| failed_c2d_methods_rate_timeframe | Monitor timeframe for IoT Hub failed c2d method [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| failed_c2d_methods_rate_timeframe | Monitor timeframe for IoT Hub failed c2d method [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| failed_c2d_twin_read_rate_enabled | Flag to enable IoT Hub failed c2d twin read monitor | string | `true` | no |
|
| failed_c2d_twin_read_rate_enabled | Flag to enable IoT Hub failed c2d twin read monitor | string | `true` | no |
|
||||||
| failed_c2d_twin_read_rate_extra_tags | Extra tags for IoT Hub failed c2d twin read monitor | list | `[]` | no |
|
| failed_c2d_twin_read_rate_extra_tags | Extra tags for IoT Hub failed c2d twin read monitor | list | `[]` | no |
|
||||||
@ -57,6 +59,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| failed_c2d_twin_read_rate_silenced | Groups to mute for IoT Hub failed c2d twin read monitor | map | `{}` | no |
|
| failed_c2d_twin_read_rate_silenced | Groups to mute for IoT Hub failed c2d twin read monitor | map | `{}` | no |
|
||||||
| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `90` | no |
|
| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `90` | no |
|
||||||
| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `50` | no |
|
| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `50` | no |
|
||||||
|
| failed_c2d_twin_read_rate_time_aggregator | Monitor aggregator for IoT Hub failed c2d twin read [available values: min, max, sum or avg] | string | `min` | no |
|
||||||
| failed_c2d_twin_read_rate_timeframe | Monitor timeframe for IoT Hub failed c2d twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| failed_c2d_twin_read_rate_timeframe | Monitor timeframe for IoT Hub failed c2d twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| failed_c2d_twin_update_rate_enabled | Flag to enable IoT Hub failed c2d twin update monitor | string | `true` | no |
|
| failed_c2d_twin_update_rate_enabled | Flag to enable IoT Hub failed c2d twin update monitor | string | `true` | no |
|
||||||
| failed_c2d_twin_update_rate_extra_tags | Extra tags for IoT Hub failed c2d twin update monitor | list | `[]` | no |
|
| failed_c2d_twin_update_rate_extra_tags | Extra tags for IoT Hub failed c2d twin update monitor | list | `[]` | no |
|
||||||
@ -64,6 +67,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| failed_c2d_twin_update_rate_silenced | Groups to mute for IoT Hub failed c2d twin update monitor | map | `{}` | no |
|
| failed_c2d_twin_update_rate_silenced | Groups to mute for IoT Hub failed c2d twin update monitor | map | `{}` | no |
|
||||||
| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `90` | no |
|
| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `90` | no |
|
||||||
| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `50` | no |
|
| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `50` | no |
|
||||||
|
| failed_c2d_twin_update_rate_time_aggregator | Monitor aggregator for IoT Hub failed c2d twin update [available values: min, max, sum or avg] | string | `min` | no |
|
||||||
| failed_c2d_twin_update_rate_timeframe | Monitor timeframe for IoT Hub failed c2d twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| failed_c2d_twin_update_rate_timeframe | Monitor timeframe for IoT Hub failed c2d twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| failed_d2c_twin_read_rate_enabled | Flag to enable IoT Hub failed d2c twin read monitor | string | `true` | no |
|
| failed_d2c_twin_read_rate_enabled | Flag to enable IoT Hub failed d2c twin read monitor | string | `true` | no |
|
||||||
| failed_d2c_twin_read_rate_extra_tags | Extra tags for IoT Hub failed d2c twin read monitor | list | `[]` | no |
|
| failed_d2c_twin_read_rate_extra_tags | Extra tags for IoT Hub failed d2c twin read monitor | list | `[]` | no |
|
||||||
@ -71,6 +75,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| failed_d2c_twin_read_rate_silenced | Groups to mute for IoT Hub failed d2c twin read monitor | map | `{}` | no |
|
| failed_d2c_twin_read_rate_silenced | Groups to mute for IoT Hub failed d2c twin read monitor | map | `{}` | no |
|
||||||
| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `90` | no |
|
| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `90` | no |
|
||||||
| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `50` | no |
|
| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `50` | no |
|
||||||
|
| failed_d2c_twin_read_rate_time_aggregator | Monitor aggregator for IoT Hub failed d2c twin read [available values: min, max, sum or avg] | string | `min` | no |
|
||||||
| failed_d2c_twin_read_rate_timeframe | Monitor timeframe for IoT Hub failed d2c twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| failed_d2c_twin_read_rate_timeframe | Monitor timeframe for IoT Hub failed d2c twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| failed_d2c_twin_update_rate_enabled | Flag to enable IoT Hub failed d2c twin update monitor | string | `true` | no |
|
| failed_d2c_twin_update_rate_enabled | Flag to enable IoT Hub failed d2c twin update monitor | string | `true` | no |
|
||||||
| failed_d2c_twin_update_rate_extra_tags | Extra tags for IoT Hub failed d2c twin update monitor | list | `[]` | no |
|
| failed_d2c_twin_update_rate_extra_tags | Extra tags for IoT Hub failed d2c twin update monitor | list | `[]` | no |
|
||||||
@ -78,6 +83,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| failed_d2c_twin_update_rate_silenced | Groups to mute for IoT Hub failed d2c twin update monitor | map | `{}` | no |
|
| failed_d2c_twin_update_rate_silenced | Groups to mute for IoT Hub failed d2c twin update monitor | map | `{}` | no |
|
||||||
| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `90` | no |
|
| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `90` | no |
|
||||||
| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `50` | no |
|
| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `50` | no |
|
||||||
|
| failed_d2c_twin_update_rate_time_aggregator | Monitor aggregator for IoT Hub failed d2c twin update [available values: min, max, sum or avg] | string | `min` | no |
|
||||||
| failed_d2c_twin_update_rate_timeframe | Monitor timeframe for IoT Hub failed d2c twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| failed_d2c_twin_update_rate_timeframe | Monitor timeframe for IoT Hub failed d2c twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| failed_jobs_rate_enabled | Flag to enable IoT Hub failed jobs monitor | string | `true` | no |
|
| failed_jobs_rate_enabled | Flag to enable IoT Hub failed jobs monitor | string | `true` | no |
|
||||||
| failed_jobs_rate_extra_tags | Extra tags for IoT Hub failed jobs monitor | list | `[]` | no |
|
| failed_jobs_rate_extra_tags | Extra tags for IoT Hub failed jobs monitor | list | `[]` | no |
|
||||||
@ -85,6 +91,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| failed_jobs_rate_silenced | Groups to mute for IoT Hub failed jobs monitor | map | `{}` | no |
|
| failed_jobs_rate_silenced | Groups to mute for IoT Hub failed jobs monitor | map | `{}` | no |
|
||||||
| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `90` | no |
|
| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `90` | no |
|
||||||
| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `50` | no |
|
| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `50` | no |
|
||||||
|
| failed_jobs_rate_time_aggregator | Monitor aggregator for IoT Hub failed jobs [available values: min, max, sum or avg] | string | `min` | no |
|
||||||
| failed_jobs_rate_timeframe | Monitor timeframe for IoT Hub failed jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| failed_jobs_rate_timeframe | Monitor timeframe for IoT Hub failed jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| failed_listjobs_rate_enabled | Flag to enable IoT Hub failed list jobs monitor | string | `true` | no |
|
| failed_listjobs_rate_enabled | Flag to enable IoT Hub failed list jobs monitor | string | `true` | no |
|
||||||
| failed_listjobs_rate_extra_tags | Extra tags for IoT Hub failed list jobs monitor | list | `[]` | no |
|
| failed_listjobs_rate_extra_tags | Extra tags for IoT Hub failed list jobs monitor | list | `[]` | no |
|
||||||
@ -92,6 +99,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| failed_listjobs_rate_silenced | Groups to mute for IoT Hub failed list jobs monitor | map | `{}` | no |
|
| failed_listjobs_rate_silenced | Groups to mute for IoT Hub failed list jobs monitor | map | `{}` | no |
|
||||||
| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `90` | no |
|
| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `90` | no |
|
||||||
| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `50` | no |
|
| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `50` | no |
|
||||||
|
| failed_listjobs_rate_time_aggregator | Monitor aggregator for IoT Hub failed list jobs [available values: min, max, sum or avg] | string | `min` | no |
|
||||||
| failed_listjobs_rate_timeframe | Monitor timeframe for IoT Hub failed list jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| failed_listjobs_rate_timeframe | Monitor timeframe for IoT Hub failed list jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| failed_queryjobs_rate_enabled | Flag to enable IoT Hub failed query jobs monitor | string | `true` | no |
|
| failed_queryjobs_rate_enabled | Flag to enable IoT Hub failed query jobs monitor | string | `true` | no |
|
||||||
| failed_queryjobs_rate_extra_tags | Extra tags for IoT Hub failed query jobs monitor | list | `[]` | no |
|
| failed_queryjobs_rate_extra_tags | Extra tags for IoT Hub failed query jobs monitor | list | `[]` | no |
|
||||||
@ -99,6 +107,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| failed_queryjobs_rate_silenced | Groups to mute for IoT Hub failed query jobs monitor | map | `{}` | no |
|
| failed_queryjobs_rate_silenced | Groups to mute for IoT Hub failed query jobs monitor | map | `{}` | no |
|
||||||
| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `90` | no |
|
| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `90` | no |
|
||||||
| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` | no |
|
| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` | no |
|
||||||
|
| failed_queryjobs_rate_time_aggregator | Monitor aggregator for IoT Hub failed query jobs [available values: min, max, sum or avg] | string | `min` | no |
|
||||||
| failed_queryjobs_rate_timeframe | Monitor timeframe for IoT Hub failed query jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| failed_queryjobs_rate_timeframe | Monitor timeframe for IoT Hub failed query jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| filter_tags | Tags used for filtering | string | `*` | no |
|
| filter_tags | Tags used for filtering | string | `*` | no |
|
||||||
| invalid_d2c_telemetry_egress_enabled | Flag to enable IoT Hub invalid d2c telemetry monitor | string | `true` | no |
|
| invalid_d2c_telemetry_egress_enabled | Flag to enable IoT Hub invalid d2c telemetry monitor | string | `true` | no |
|
||||||
@ -107,6 +116,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| invalid_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `90` | no |
|
| invalid_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `90` | no |
|
||||||
| invalid_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `50` | no |
|
| invalid_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `50` | no |
|
||||||
| invalid_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub invalid d2c telemetry monitor | map | `{}` | no |
|
| invalid_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub invalid d2c telemetry monitor | map | `{}` | no |
|
||||||
|
| invalid_d2c_telemetry_egress_time_aggregator | Monitor aggregator for IoT Hub invalid d2c telemetry [available values: min, max, sum or avg] | string | `min` | no |
|
||||||
| invalid_d2c_telemetry_egress_timeframe | Monitor timeframe for IoT Hub invalid d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| invalid_d2c_telemetry_egress_timeframe | Monitor timeframe for IoT Hub invalid d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| message | Message sent when an alert is triggered | string | - | yes |
|
| message | Message sent when an alert is triggered | string | - | yes |
|
||||||
| new_host_delay | Delay in seconds before monitor new resource | string | `300` | no |
|
| new_host_delay | Delay in seconds before monitor new resource | string | `300` | no |
|
||||||
@ -116,6 +126,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| orphaned_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `90` | no |
|
| orphaned_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `90` | no |
|
||||||
| orphaned_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `50` | no |
|
| orphaned_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `50` | no |
|
||||||
| orphaned_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub orphaned d2c telemetry monitor | map | `{}` | no |
|
| orphaned_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub orphaned d2c telemetry monitor | map | `{}` | no |
|
||||||
|
| orphaned_d2c_telemetry_egress_time_aggregator | Monitor aggregator for IoT Hub orphaned d2c telemetry [available values: min, max, sum or avg] | string | `min` | no |
|
||||||
| orphaned_d2c_telemetry_egress_timeframe | Monitor timeframe for IoT Hub orphaned d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| orphaned_d2c_telemetry_egress_timeframe | Monitor timeframe for IoT Hub orphaned d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| status_enabled | Flag to enable IoT Hub status monitor | string | `true` | no |
|
| status_enabled | Flag to enable IoT Hub status monitor | string | `true` | no |
|
||||||
| status_extra_tags | Extra tags for IoT Hub status monitor | list | `[]` | no |
|
| status_extra_tags | Extra tags for IoT Hub status monitor | list | `[]` | no |
|
||||||
|
|||||||
@ -151,6 +151,12 @@ variable "failed_jobs_rate_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "failed_jobs_rate_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for IoT Hub failed jobs [available values: min, max, sum or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "failed_jobs_rate_timeframe" {
|
variable "failed_jobs_rate_timeframe" {
|
||||||
description = "Monitor timeframe for IoT Hub failed jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for IoT Hub failed jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -191,6 +197,12 @@ variable "failed_listjobs_rate_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "failed_listjobs_rate_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for IoT Hub failed list jobs [available values: min, max, sum or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "failed_listjobs_rate_timeframe" {
|
variable "failed_listjobs_rate_timeframe" {
|
||||||
description = "Monitor timeframe for IoT Hub failed list jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for IoT Hub failed list jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -231,6 +243,12 @@ variable "failed_queryjobs_rate_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "failed_queryjobs_rate_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for IoT Hub failed query jobs [available values: min, max, sum or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "failed_queryjobs_rate_timeframe" {
|
variable "failed_queryjobs_rate_timeframe" {
|
||||||
description = "Monitor timeframe for IoT Hub failed query jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for IoT Hub failed query jobs [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -271,6 +289,12 @@ variable "failed_c2d_methods_rate_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "failed_c2d_methods_rate_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for IoT Hub failed c2d method [available values: min, max, sum or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "failed_c2d_methods_rate_timeframe" {
|
variable "failed_c2d_methods_rate_timeframe" {
|
||||||
description = "Monitor timeframe for IoT Hub failed c2d method [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for IoT Hub failed c2d method [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -311,6 +335,12 @@ variable "failed_c2d_twin_read_rate_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "failed_c2d_twin_read_rate_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for IoT Hub failed c2d twin read [available values: min, max, sum or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "failed_c2d_twin_read_rate_timeframe" {
|
variable "failed_c2d_twin_read_rate_timeframe" {
|
||||||
description = "Monitor timeframe for IoT Hub failed c2d twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for IoT Hub failed c2d twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -351,6 +381,12 @@ variable "failed_c2d_twin_update_rate_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "failed_c2d_twin_update_rate_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for IoT Hub failed c2d twin update [available values: min, max, sum or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "failed_c2d_twin_update_rate_timeframe" {
|
variable "failed_c2d_twin_update_rate_timeframe" {
|
||||||
description = "Monitor timeframe for IoT Hub failed c2d twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for IoT Hub failed c2d twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -391,6 +427,12 @@ variable "failed_d2c_twin_read_rate_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "failed_d2c_twin_read_rate_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for IoT Hub failed d2c twin read [available values: min, max, sum or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "failed_d2c_twin_read_rate_timeframe" {
|
variable "failed_d2c_twin_read_rate_timeframe" {
|
||||||
description = "Monitor timeframe for IoT Hub failed d2c twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for IoT Hub failed d2c twin read [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -431,6 +473,12 @@ variable "failed_d2c_twin_update_rate_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "failed_d2c_twin_update_rate_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for IoT Hub failed d2c twin update [available values: min, max, sum or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "failed_d2c_twin_update_rate_timeframe" {
|
variable "failed_d2c_twin_update_rate_timeframe" {
|
||||||
description = "Monitor timeframe for IoT Hub failed d2c twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for IoT Hub failed d2c twin update [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -471,6 +519,12 @@ variable "dropped_d2c_telemetry_egress_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "dropped_d2c_telemetry_egress_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for IoT Hub dropped d2c telemetry [available values: min, max, sum or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "dropped_d2c_telemetry_egress_timeframe" {
|
variable "dropped_d2c_telemetry_egress_timeframe" {
|
||||||
description = "Monitor timeframe for IoT Hub dropped d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for IoT Hub dropped d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -511,6 +565,12 @@ variable "orphaned_d2c_telemetry_egress_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "orphaned_d2c_telemetry_egress_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for IoT Hub orphaned d2c telemetry [available values: min, max, sum or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "orphaned_d2c_telemetry_egress_timeframe" {
|
variable "orphaned_d2c_telemetry_egress_timeframe" {
|
||||||
description = "Monitor timeframe for IoT Hub orphaned d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for IoT Hub orphaned d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -551,6 +611,12 @@ variable "invalid_d2c_telemetry_egress_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "invalid_d2c_telemetry_egress_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for IoT Hub invalid d2c telemetry [available values: min, max, sum or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "invalid_d2c_telemetry_egress_timeframe" {
|
variable "invalid_d2c_telemetry_egress_timeframe" {
|
||||||
description = "Monitor timeframe for IoT Hub invalid d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for IoT Hub invalid d2c telemetry [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
|
|||||||
@ -4,11 +4,11 @@ resource "datadog_monitor" "too_many_jobs_failed" {
|
|||||||
message = "${coalesce(var.failed_jobs_rate_message, var.message)}"
|
message = "${coalesce(var.failed_jobs_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.failed_jobs_rate_timeframe}):(
|
${var.failed_jobs_rate_time_aggregator}(${var.failed_jobs_rate_timeframe}):(
|
||||||
avg:azure.devices_iothubs.jobs.failed{${var.filter_tags}} by {resource_group,region,name}.as_count() /
|
default(avg:azure.devices_iothubs.jobs.failed{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
( avg:azure.devices_iothubs.jobs.failed{${var.filter_tags}} by {resource_group,region,name}.as_count() +
|
default(avg:azure.devices_iothubs.jobs.failed{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.jobs.completed{${var.filter_tags}} by {resource_group,region,name}.as_count() )
|
default(avg:azure.devices_iothubs.jobs.completed{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
) * 100 > ${var.failed_jobs_rate_threshold_critical}
|
) * 100 > ${var.failed_jobs_rate_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
@ -39,11 +39,11 @@ resource "datadog_monitor" "too_many_list_jobs_failed" {
|
|||||||
message = "${coalesce(var.failed_listjobs_rate_message, var.message)}"
|
message = "${coalesce(var.failed_listjobs_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.failed_listjobs_rate_timeframe}):(
|
${var.failed_listjobs_rate_time_aggregator}(${var.failed_listjobs_rate_timeframe}):(
|
||||||
avg:azure.devices_iothubs.jobs.list_jobs.failure{${var.filter_tags}} by {resource_group,name}.as_count() /
|
default(avg:azure.devices_iothubs.jobs.list_jobs.failure{${var.filter_tags}} by {resource_group,name}.as_rate(), 0) / (
|
||||||
( avg:azure.devices_iothubs.jobs.list_jobs.success{${var.filter_tags}} by {resource_group,name}.as_count() +
|
default(avg:azure.devices_iothubs.jobs.list_jobs.success{${var.filter_tags}} by {resource_group,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.jobs.list_jobs.failure{${var.filter_tags}} by {resource_group,name}.as_count() )
|
default(avg:azure.devices_iothubs.jobs.list_jobs.failure{${var.filter_tags}} by {resource_group,name}.as_rate(), 0) )
|
||||||
) * 100 > ${var.failed_listjobs_rate_threshold_critical}
|
) * 100 > ${var.failed_listjobs_rate_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
@ -74,10 +74,10 @@ resource "datadog_monitor" "too_many_query_jobs_failed" {
|
|||||||
message = "${coalesce(var.failed_queryjobs_rate_message, var.message)}"
|
message = "${coalesce(var.failed_queryjobs_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.failed_queryjobs_rate_timeframe}):(
|
${var.failed_queryjobs_rate_time_aggregator}(${var.failed_queryjobs_rate_timeframe}):(
|
||||||
avg:azure.devices_iothubs.jobs.query_jobs.failure{${var.filter_tags}} by {resource_group,name}.as_count() /
|
default(avg:azure.devices_iothubs.jobs.query_jobs.failure{${var.filter_tags}} by {resource_group,name}.as_rate(), 0) / (
|
||||||
( avg:azure.devices_iothubs.jobs.query_jobs.success{${var.filter_tags}} by {resource_group,name}.as_count() +
|
default(avg:azure.devices_iothubs.jobs.query_jobs.success{${var.filter_tags}} by {resource_group,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.jobs.query_jobs.failure{${var.filter_tags}} by {resource_group,name}.as_count() )
|
default(avg:azure.devices_iothubs.jobs.query_jobs.failure{${var.filter_tags}} by {resource_group,name}.as_rate(), 0) )
|
||||||
) * 100 > ${var.failed_queryjobs_rate_threshold_critical}
|
) * 100 > ${var.failed_queryjobs_rate_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -165,10 +165,10 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" {
|
|||||||
message = "${coalesce(var.failed_c2d_methods_rate_message, var.message)}"
|
message = "${coalesce(var.failed_c2d_methods_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.failed_c2d_methods_rate_timeframe}):(
|
${var.failed_c2d_methods_rate_time_aggregator}(${var.failed_c2d_methods_rate_timeframe}):(
|
||||||
avg:azure.devices_iothubs.c2d.methods.failure{${var.filter_tags}} by {resource_group,region,name}.as_count() /
|
default(avg:azure.devices_iothubs.c2d.methods.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
( avg:azure.devices_iothubs.c2d.methods.failure{${var.filter_tags}} by {resource_group,region,name}.as_count() +
|
default(avg:azure.devices_iothubs.c2d.methods.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.c2d.methods.success{${var.filter_tags}} by {resource_group,region,name}.as_count() )
|
default(avg:azure.devices_iothubs.c2d.methods.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
) * 100 > ${var.failed_c2d_methods_rate_threshold_critical}
|
) * 100 > ${var.failed_c2d_methods_rate_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -200,10 +200,10 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
|
|||||||
message = "${coalesce(var.failed_c2d_twin_read_rate_message, var.message)}"
|
message = "${coalesce(var.failed_c2d_twin_read_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.failed_c2d_twin_read_rate_timeframe}):(
|
${var.failed_c2d_twin_read_rate_time_aggregator}(${var.failed_c2d_twin_read_rate_timeframe}):(
|
||||||
avg:azure.devices_iothubs.c2d.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_count() /
|
default(avg:azure.devices_iothubs.c2d.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
( avg:azure.devices_iothubs.c2d.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_count() +
|
default(avg:azure.devices_iothubs.c2d.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.c2d.twin.read.success{${var.filter_tags}} by {resource_group,region,name}.as_count() )
|
default(avg:azure.devices_iothubs.c2d.twin.read.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
) * 100 > ${var.failed_c2d_twin_read_rate_threshold_critical}
|
) * 100 > ${var.failed_c2d_twin_read_rate_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -235,10 +235,10 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
|
|||||||
message = "${coalesce(var.failed_c2d_twin_update_rate_message, var.message)}"
|
message = "${coalesce(var.failed_c2d_twin_update_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.failed_c2d_twin_update_rate_timeframe}):(
|
${var.failed_c2d_twin_update_rate_time_aggregator}(${var.failed_c2d_twin_update_rate_timeframe}):(
|
||||||
avg:azure.devices_iothubs.c2d.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_count() /
|
default(avg:azure.devices_iothubs.c2d.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
( avg:azure.devices_iothubs.c2d.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_count() +
|
default(avg:azure.devices_iothubs.c2d.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.c2d.twin.update.success{${var.filter_tags}} by {resource_group,region,name}.as_count() )
|
default(avg:azure.devices_iothubs.c2d.twin.update.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
) * 100 > ${var.failed_c2d_twin_update_rate_threshold_critical}
|
) * 100 > ${var.failed_c2d_twin_update_rate_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -270,10 +270,10 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" {
|
|||||||
message = "${coalesce(var.failed_d2c_twin_read_rate_message, var.message)}"
|
message = "${coalesce(var.failed_d2c_twin_read_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.failed_d2c_twin_read_rate_timeframe}):(
|
${var.failed_d2c_twin_read_rate_time_aggregator}(${var.failed_d2c_twin_read_rate_timeframe}):(
|
||||||
avg:azure.devices_iothubs.d2c.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_count() /
|
default(avg:azure.devices_iothubs.d2c.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
( avg:azure.devices_iothubs.d2c.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_count() +
|
default(avg:azure.devices_iothubs.d2c.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.d2c.twin.read.success{${var.filter_tags}} by {resource_group,region,name}.as_count() )
|
default(avg:azure.devices_iothubs.d2c.twin.read.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
) * 100 > ${var.failed_d2c_twin_read_rate_threshold_critical}
|
) * 100 > ${var.failed_d2c_twin_read_rate_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -305,10 +305,10 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" {
|
|||||||
message = "${coalesce(var.failed_d2c_twin_update_rate_message, var.message)}"
|
message = "${coalesce(var.failed_d2c_twin_update_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.failed_d2c_twin_update_rate_timeframe}):(
|
${var.failed_d2c_twin_update_rate_time_aggregator}(${var.failed_d2c_twin_update_rate_timeframe}):(
|
||||||
avg:azure.devices_iothubs.d2c.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_count() /
|
default(avg:azure.devices_iothubs.d2c.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
( avg:azure.devices_iothubs.d2c.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_count() +
|
default(avg:azure.devices_iothubs.d2c.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.d2c.twin.update.success{${var.filter_tags}} by {resource_group,region,name}.as_count() )
|
default(avg:azure.devices_iothubs.d2c.twin.update.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
) * 100 > ${var.failed_d2c_twin_update_rate_threshold_critical}
|
) * 100 > ${var.failed_d2c_twin_update_rate_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
@ -340,14 +340,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
|
|||||||
message = "${coalesce(var.dropped_d2c_telemetry_egress_message, var.message)}"
|
message = "${coalesce(var.dropped_d2c_telemetry_egress_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.dropped_d2c_telemetry_egress_timeframe}): (
|
${var.dropped_d2c_telemetry_egress_time_aggregator}(${var.dropped_d2c_telemetry_egress_timeframe}): (
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_count() /
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_count() +
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_count() +
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_count() +
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_count())
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
* 100
|
) * 100 > ${var.dropped_d2c_telemetry_egress_rate_threshold_critical}
|
||||||
) > ${var.dropped_d2c_telemetry_egress_rate_threshold_critical}
|
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
@ -378,14 +377,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
|
|||||||
message = "${coalesce(var.orphaned_d2c_telemetry_egress_message, var.message)}"
|
message = "${coalesce(var.orphaned_d2c_telemetry_egress_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.orphaned_d2c_telemetry_egress_timeframe}): (
|
${var.orphaned_d2c_telemetry_egress_time_aggregator}(${var.orphaned_d2c_telemetry_egress_timeframe}): (
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_count() /
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_count() +
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_count() +
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_count() +
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_count())
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
* 100
|
) * 100 > ${var.orphaned_d2c_telemetry_egress_rate_threshold_critical}
|
||||||
) > ${var.orphaned_d2c_telemetry_egress_rate_threshold_critical}
|
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
@ -416,14 +414,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
|
|||||||
message = "${coalesce(var.invalid_d2c_telemetry_egress_message, var.message)}"
|
message = "${coalesce(var.invalid_d2c_telemetry_egress_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.invalid_d2c_telemetry_egress_timeframe}): (
|
${var.invalid_d2c_telemetry_egress_time_aggregator}(${var.invalid_d2c_telemetry_egress_timeframe}): (
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_count() /
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_count() +
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_count() +
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_count() +
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_count())
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
* 100
|
) * 100 > ${var.invalid_d2c_telemetry_egress_rate_threshold_critical}
|
||||||
) > ${var.invalid_d2c_telemetry_egress_rate_threshold_critical}
|
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
@ -456,7 +453,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" {
|
|||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.too_many_d2c_telemetry_ingress_nosent_timeframe}): (
|
sum(${var.too_many_d2c_telemetry_ingress_nosent_timeframe}): (
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol{${var.filter_tags}} by {resource_group,region,name}.as_count() -
|
avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol{${var.filter_tags}} by {resource_group,region,name}.as_count() -
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.ingress.success{${var.filter_tags}} by {resource_group,region,name}.as_count()
|
avg:azure.devices_iothubs.d2c.telemetry.ingress.success{${var.filter_tags}} by {resource_group,region,name}.as_count()
|
||||||
) > 0
|
) > 0
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
|||||||
@ -38,7 +38,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| api_result_silenced | Groups to mute for Key Vault API result monitor | map | `{}` | no |
|
| api_result_silenced | Groups to mute for Key Vault API result monitor | map | `{}` | no |
|
||||||
| api_result_threshold_critical | Critical threshold for Key Vault API result rate | string | `10` | no |
|
| api_result_threshold_critical | Critical threshold for Key Vault API result rate | string | `10` | no |
|
||||||
| api_result_threshold_warning | Warning threshold for Key Vault API result rate | string | `30` | no |
|
| api_result_threshold_warning | Warning threshold for Key Vault API result rate | string | `30` | no |
|
||||||
| api_result_time_aggregator | Monitor aggregator for Key Vault API result [available values: min, max or avg] | string | `sum` | no |
|
| api_result_time_aggregator | Monitor aggregator for Key Vault API result [available values: min, max or avg] | string | `max` | no |
|
||||||
| api_result_timeframe | Monitor timeframe for Key Vault API result [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| api_result_timeframe | Monitor timeframe for Key Vault API result [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| environment | Architecture environment | string | - | yes |
|
| environment | Architecture environment | string | - | yes |
|
||||||
| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||||
|
|||||||
@ -89,7 +89,7 @@ variable "api_result_message" {
|
|||||||
variable "api_result_time_aggregator" {
|
variable "api_result_time_aggregator" {
|
||||||
description = "Monitor aggregator for Key Vault API result [available values: min, max or avg]"
|
description = "Monitor aggregator for Key Vault API result [available values: min, max or avg]"
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "sum"
|
default = "max"
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "api_result_timeframe" {
|
variable "api_result_timeframe" {
|
||||||
|
|||||||
@ -35,8 +35,8 @@ resource "datadog_monitor" "keyvault_api_result" {
|
|||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
${var.api_result_time_aggregator}(${var.api_result_timeframe}): (
|
${var.api_result_time_aggregator}(${var.api_result_timeframe}): (
|
||||||
avg:azure.keyvault_vaults.service_api_result${format(module.filter-tags-statuscode.query_alert, "200")} by {name,resource_group,region}.as_count() /
|
default(avg:azure.keyvault_vaults.service_api_result${format(module.filter-tags-statuscode.query_alert, "200")} by {name,resource_group,region}.as_rate(), 1) /
|
||||||
avg:azure.keyvault_vaults.service_api_result${module.filter-tags.query_alert} by {name,resource_group,region}.as_count()
|
default(avg:azure.keyvault_vaults.service_api_result${module.filter-tags.query_alert} by {name,resource_group,region}.as_rate(), 1)
|
||||||
) * 100 < ${var.api_result_threshold_critical}
|
) * 100 < ${var.api_result_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
|||||||
@ -42,6 +42,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| server_errors_silenced | Groups to mute for Service Bus server errors monitor | map | `{}` | no |
|
| server_errors_silenced | Groups to mute for Service Bus server errors monitor | map | `{}` | no |
|
||||||
| server_errors_threshold_critical | Critical threshold for Service Bus server errors monitor | string | `90` | no |
|
| server_errors_threshold_critical | Critical threshold for Service Bus server errors monitor | string | `90` | no |
|
||||||
| server_errors_threshold_warning | Warning threshold for Service Bus server errors monitor | string | `50` | no |
|
| server_errors_threshold_warning | Warning threshold for Service Bus server errors monitor | string | `50` | no |
|
||||||
|
| server_errors_time_aggregator | Monitor aggregator for Service Bus server errors [available values: min, max or avg] | string | `min` | no |
|
||||||
| server_errors_timeframe | Monitor timeframe for Service Bus server errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| server_errors_timeframe | Monitor timeframe for Service Bus server errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| status_enabled | Flag to enable Service Bus status monitor | string | `true` | no |
|
| status_enabled | Flag to enable Service Bus status monitor | string | `true` | no |
|
||||||
| status_extra_tags | Extra tags for Service Bus status monitor | list | `[]` | no |
|
| status_extra_tags | Extra tags for Service Bus status monitor | list | `[]` | no |
|
||||||
@ -54,6 +55,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| user_errors_silenced | Groups to mute for Service Bus user errors monitor | map | `{}` | no |
|
| user_errors_silenced | Groups to mute for Service Bus user errors monitor | map | `{}` | no |
|
||||||
| user_errors_threshold_critical | Critical threshold for Service Bus user errors monitor | string | `90` | no |
|
| user_errors_threshold_critical | Critical threshold for Service Bus user errors monitor | string | `90` | no |
|
||||||
| user_errors_threshold_warning | Warning threshold for Service Bus user errors monitor | string | `50` | no |
|
| user_errors_threshold_warning | Warning threshold for Service Bus user errors monitor | string | `50` | no |
|
||||||
|
| user_errors_time_aggregator | Monitor aggregator for Service Bus user errors [available values: min, max or avg] | string | `min` | no |
|
||||||
| user_errors_timeframe | Monitor timeframe for Service Bus user errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| user_errors_timeframe | Monitor timeframe for Service Bus user errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
|
|
||||||
## Outputs
|
## Outputs
|
||||||
|
|||||||
@ -118,6 +118,12 @@ variable "server_errors_silenced" {
|
|||||||
default = {}
|
default = {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "server_errors_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for Service Bus server errors [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "server_errors_timeframe" {
|
variable "server_errors_timeframe" {
|
||||||
description = "Monitor timeframe for Service Bus server errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for Service Bus server errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -152,6 +158,12 @@ variable "user_errors_silenced" {
|
|||||||
default = {}
|
default = {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "user_errors_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for Service Bus user errors [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "user_errors_timeframe" {
|
variable "user_errors_timeframe" {
|
||||||
description = "Monitor timeframe for Service Bus user errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for Service Bus user errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
|
|||||||
@ -63,11 +63,10 @@ resource "datadog_monitor" "service_bus_user_errors" {
|
|||||||
message = "${coalesce(var.user_errors_message, var.message)}"
|
message = "${coalesce(var.user_errors_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.user_errors_timeframe}): (default(
|
${var.user_errors_time_aggregator}(${var.user_errors_timeframe}): (
|
||||||
avg:azure.servicebus_namespaces.user_errors.preview${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() /
|
default(avg:azure.servicebus_namespaces.user_errors.preview${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
||||||
avg:azure.servicebus_namespaces.incoming_requests_preview${module.filter-tags.query_alert} by {resource_group,region,name}.as_count()
|
default(avg:azure.servicebus_namespaces.incoming_requests_preview${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
* 100, 0)
|
) * 100 > ${var.user_errors_threshold_critical}
|
||||||
) > ${var.user_errors_threshold_critical}
|
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
@ -99,11 +98,10 @@ resource "datadog_monitor" "service_bus_server_errors" {
|
|||||||
message = "${coalesce(var.server_errors_message, var.message)}"
|
message = "${coalesce(var.server_errors_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.server_errors_timeframe}): (default(
|
${var.server_errors_time_aggregator}(${var.server_errors_timeframe}): (
|
||||||
avg:azure.servicebus_namespaces.server_errors.preview${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() /
|
default(avg:azure.servicebus_namespaces.server_errors.preview${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
||||||
avg:azure.servicebus_namespaces.incoming_requests_preview${module.filter-tags.query_alert} by {resource_group,region,name}.as_count()
|
default(avg:azure.servicebus_namespaces.incoming_requests_preview${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
* 100, 0)
|
) * 100 > ${var.server_errors_threshold_critical}
|
||||||
) > ${var.server_errors_threshold_critical}
|
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|||||||
@ -42,6 +42,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| failed_function_requests_silenced | Groups to mute for Stream Analytics failed requests monitor | map | `{}` | no |
|
| failed_function_requests_silenced | Groups to mute for Stream Analytics failed requests monitor | map | `{}` | no |
|
||||||
| failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no |
|
| failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no |
|
||||||
| failed_function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no |
|
| failed_function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no |
|
||||||
|
| failed_function_requests_time_aggregator | Monitor aggregator for Stream Analytics failed requests [available values: min, max or avg] | string | `min` | no |
|
||||||
| failed_function_requests_timeframe | Monitor timeframe for Stream Analytics failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
| failed_function_requests_timeframe | Monitor timeframe for Stream Analytics failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
|
||||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||||
| filter_tags_custom_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `` | no |
|
| filter_tags_custom_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `` | no |
|
||||||
|
|||||||
@ -141,6 +141,12 @@ variable "failed_function_requests_message" {
|
|||||||
default = ""
|
default = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "failed_function_requests_time_aggregator" {
|
||||||
|
description = "Monitor aggregator for Stream Analytics failed requests [available values: min, max or avg]"
|
||||||
|
type = "string"
|
||||||
|
default = "min"
|
||||||
|
}
|
||||||
|
|
||||||
variable "failed_function_requests_timeframe" {
|
variable "failed_function_requests_timeframe" {
|
||||||
description = "Monitor timeframe for Stream Analytics failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
description = "Monitor timeframe for Stream Analytics failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
|
||||||
type = "string"
|
type = "string"
|
||||||
|
|||||||
@ -65,9 +65,9 @@ resource "datadog_monitor" "failed_function_requests" {
|
|||||||
message = "${coalesce(var.failed_function_requests_message, var.message)}"
|
message = "${coalesce(var.failed_function_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
sum(${var.failed_function_requests_timeframe}): (
|
${var.failed_function_requests_time_aggregator}(${var.failed_function_requests_timeframe}): (
|
||||||
avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() /
|
default(avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
||||||
avg:azure.streamanalytics_streamingjobs.aml_callout_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_count()
|
default(avg:azure.streamanalytics_streamingjobs.aml_callout_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
) * 100 > ${var.failed_function_requests_threshold_critical}
|
) * 100 > ${var.failed_function_requests_threshold_critical}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
|||||||
@ -50,7 +50,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| error_rate_4xx_silenced | Groups to mute for GCP LB 4XX Errors monitor | map | `{}` | no |
|
| error_rate_4xx_silenced | Groups to mute for GCP LB 4XX Errors monitor | map | `{}` | no |
|
||||||
| error_rate_4xx_threshold_critical | Rate error in percentage (critical threshold) | string | `60` | no |
|
| error_rate_4xx_threshold_critical | Rate error in percentage (critical threshold) | string | `60` | no |
|
||||||
| error_rate_4xx_threshold_warning | Rate error in percentage (warning threshold) | string | `50` | no |
|
| error_rate_4xx_threshold_warning | Rate error in percentage (warning threshold) | string | `50` | no |
|
||||||
| error_rate_4xx_time_aggregator | Timeframe for the GCP LB 4XX Errors monitor | string | `sum` | no |
|
| error_rate_4xx_time_aggregator | Monitor aggregator for the GCP LB 4XX Errors monitor [available values: min, max or avg] | string | `min` | no |
|
||||||
| error_rate_4xx_timeframe | Timeframe for the GCP LB 4XX Errors monitor | string | `last_5m` | no |
|
| error_rate_4xx_timeframe | Timeframe for the GCP LB 4XX Errors monitor | string | `last_5m` | no |
|
||||||
| error_rate_5xx_artificial_request | Divisor Delta for the GCP LB 5XX Errors monitor | string | `5` | no |
|
| error_rate_5xx_artificial_request | Divisor Delta for the GCP LB 5XX Errors monitor | string | `5` | no |
|
||||||
| error_rate_5xx_enabled | Flag to enable GCP LB 5XX Errors monitor | string | `true` | no |
|
| error_rate_5xx_enabled | Flag to enable GCP LB 5XX Errors monitor | string | `true` | no |
|
||||||
@ -59,7 +59,7 @@ Creates DataDog monitors with the following checks:
|
|||||||
| error_rate_5xx_silenced | Groups to mute for GCP LB 5XX Errors monitor | map | `{}` | no |
|
| error_rate_5xx_silenced | Groups to mute for GCP LB 5XX Errors monitor | map | `{}` | no |
|
||||||
| error_rate_5xx_threshold_critical | Rate error in percentage (critical threshold) | string | `40` | no |
|
| error_rate_5xx_threshold_critical | Rate error in percentage (critical threshold) | string | `40` | no |
|
||||||
| error_rate_5xx_threshold_warning | Rate error in percentage (warning threshold) | string | `30` | no |
|
| error_rate_5xx_threshold_warning | Rate error in percentage (warning threshold) | string | `30` | no |
|
||||||
| error_rate_5xx_time_aggregator | Timeframe for the GCP LB 5XX Errors monitor | string | `sum` | no |
|
| error_rate_5xx_time_aggregator | Monitor aggregator for the GCP LB 5XX Errors monitor [available values: min, max or avg] | string | `min` | no |
|
||||||
| error_rate_5xx_timeframe | Timeframe for the GCP LB 5XX Errors monitor | string | `last_5m` | no |
|
| error_rate_5xx_timeframe | Timeframe for the GCP LB 5XX Errors monitor | string | `last_5m` | no |
|
||||||
| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
| evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||||
| filter_tags | Tags used for filtering | string | `*` | no |
|
| filter_tags | Tags used for filtering | string | `*` | no |
|
||||||
|
|||||||
@ -37,7 +37,7 @@ variable "error_rate_4xx_message" {
|
|||||||
variable "error_rate_4xx_time_aggregator" {
|
variable "error_rate_4xx_time_aggregator" {
|
||||||
description = "Timeframe for the GCP LB 4XX Errors monitor"
|
description = "Timeframe for the GCP LB 4XX Errors monitor"
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "sum"
|
default = "min"
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "error_rate_4xx_timeframe" {
|
variable "error_rate_4xx_timeframe" {
|
||||||
@ -94,7 +94,7 @@ variable "error_rate_5xx_message" {
|
|||||||
variable "error_rate_5xx_time_aggregator" {
|
variable "error_rate_5xx_time_aggregator" {
|
||||||
description = "Timeframe for the GCP LB 5XX Errors monitor"
|
description = "Timeframe for the GCP LB 5XX Errors monitor"
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "sum"
|
default = "min"
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "error_rate_5xx_timeframe" {
|
variable "error_rate_5xx_timeframe" {
|
||||||
|
|||||||
@ -10,10 +10,9 @@ resource "datadog_monitor" "error_rate_4xx" {
|
|||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
${var.error_rate_4xx_time_aggregator}(${var.error_rate_4xx_timeframe}):
|
${var.error_rate_4xx_time_aggregator}(${var.error_rate_4xx_timeframe}):
|
||||||
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags},response_code_class:400} by {forwarding_rule_name}.as_count(), 0)
|
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags},response_code_class:400} by {forwarding_rule_name}.as_rate(), 0) / (
|
||||||
/ (default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags}} by {forwarding_rule_name}.as_count(), 0)
|
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags}} by {forwarding_rule_name}.as_rate() + ${var.error_rate_4xx_artificial_request}, 1))
|
||||||
+ ${var.error_rate_4xx_artificial_request}) * 100
|
* 100 > ${var.error_rate_4xx_threshold_critical}
|
||||||
> ${var.error_rate_4xx_threshold_critical}
|
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
@ -49,10 +48,9 @@ resource "datadog_monitor" "error_rate_5xx" {
|
|||||||
|
|
||||||
query = <<EOF
|
query = <<EOF
|
||||||
${var.error_rate_5xx_time_aggregator}(${var.error_rate_5xx_timeframe}):
|
${var.error_rate_5xx_time_aggregator}(${var.error_rate_5xx_timeframe}):
|
||||||
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags},response_code_class:500} by {forwarding_rule_name}.as_count(), 0)
|
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags},response_code_class:500} by {forwarding_rule_name}.as_rate(), 0) / (
|
||||||
/ (default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags}} by {forwarding_rule_name}.as_count(), 0)
|
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags}} by {forwarding_rule_name}.as_rate() + ${var.error_rate_5xx_artificial_request}, 1))
|
||||||
+ ${var.error_rate_5xx_artificial_request}) * 100
|
* 100 > ${var.error_rate_5xx_threshold_critical}
|
||||||
> ${var.error_rate_5xx_threshold_critical}
|
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user