MON-96 - Updated aws monitors with inputs best practice

This commit is contained in:
Alexandre Gaillet 2018-03-20 12:20:31 +01:00 committed by Quentin Manfroi
parent 992dfc8213
commit bb88248053
18 changed files with 278 additions and 20 deletions

View File

@ -27,17 +27,23 @@ Inputs
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| artificial_requests_count | Number of false requests used to mitigate false positive in case of low traffic | string | `5` | no |
| delay | Delay in seconds for the metric evaluation | string | `900` | no | | delay | Delay in seconds for the metric evaluation | string | `900` | no |
| environment | Environment | string | - | yes | | environment | Environment | string | - | yes |
| filter_tags | Tags used for custom filtering | string | `*` | no | | filter_tags | Tags used for filtering | string | `*` | no |
| http_4xx_requests_message | Custom message for API Gateway HTTP 4xx requests monitor | string | `` | no |
| http_4xx_requests_silenced | Groups to mute for API Gateway HTTP 4xx requests monitor | map | `<map>` | no |
| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no | | http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no |
| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no | | http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no |
| http_5xx_requests_message | Custom message for API Gateway HTTP 5xx requests monitor | string | `` | no |
| http_5xx_requests_silenced | Groups to mute for API Gateway HTTP 5xx requests monitor | map | `<map>` | no |
| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no | | http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no |
| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no | | http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no |
| latency_message | Custom message for API Gateway latency monitor | string | `` | no |
| latency_silenced | Groups to mute for API Gateway latency monitor | map | `<map>` | no |
| latency_threshold_critical | Alerting threshold in milliseconds | string | `800` | no |
| latency_threshold_warning | Warning threshold in milliseconds | string | `400` | no |
| message | Message sent when a monitor is triggered | string | - | yes | | message | Message sent when a monitor is triggered | string | - | yes |
| latency_threshold_critical | Alerting threshold in milliseconds | string | `800` | no |
| latency_threshold_warning | Warning threshold in milliseconds | string | `400` | no |
| artificial_requests_count | Number of false requests used to mitigate false positive in case of low traffic | string | `0` | no |
Related documentation Related documentation
--------------------- ---------------------

View File

@ -21,6 +21,18 @@ variable "delay" {
### LATENCY VARIABLES ### ### LATENCY VARIABLES ###
################################### ###################################
variable "latency_silenced" {
description = "Groups to mute for API Gateway latency monitor"
type = "map"
default = {}
}
variable "latency_message" {
description = "Custom message for API Gateway latency monitor"
type = "string"
default = ""
}
variable "latency_threshold_critical" { variable "latency_threshold_critical" {
default = 800 default = 800
description = "Alerting threshold in milliseconds" description = "Alerting threshold in milliseconds"
@ -35,6 +47,18 @@ variable "latency_threshold_warning" {
### HTTP 5xx status pages ### ### HTTP 5xx status pages ###
################################# #################################
variable "http_5xx_requests_silenced" {
description = "Groups to mute for API Gateway HTTP 5xx requests monitor"
type = "map"
default = {}
}
variable "http_5xx_requests_message" {
description = "Custom message for API Gateway HTTP 5xx requests monitor"
type = "string"
default = ""
}
variable "http_5xx_requests_threshold_critical" { variable "http_5xx_requests_threshold_critical" {
default = 20 default = 20
description = "Maximum critical acceptable percent of 5xx errors" description = "Maximum critical acceptable percent of 5xx errors"
@ -49,6 +73,18 @@ variable "http_5xx_requests_threshold_warning" {
### HTTP 4xx status pages ### ### HTTP 4xx status pages ###
################################# #################################
variable "http_4xx_requests_silenced" {
description = "Groups to mute for API Gateway HTTP 4xx requests monitor"
type = "map"
default = {}
}
variable "http_4xx_requests_message" {
description = "Custom message for API Gateway HTTP 4xx requests monitor"
type = "string"
default = ""
}
variable "http_4xx_requests_threshold_critical" { variable "http_4xx_requests_threshold_critical" {
default = 30 default = 30
description = "Maximum critical acceptable percent of 4xx errors" description = "Maximum critical acceptable percent of 4xx errors"

View File

@ -2,7 +2,7 @@
resource "datadog_monitor" "API_Gateway_latency" { resource "datadog_monitor" "API_Gateway_latency" {
name = "[${var.environment}] API Gateway latency {{#is_alert}}{{comparator}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" name = "[${var.environment}] API Gateway latency {{#is_alert}}{{comparator}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
type = "metric alert" type = "metric alert"
message = "${var.message}" message = "${coalesce(var.latency_message, var.message)}"
query = <<EOF query = <<EOF
avg(last_5m): ( avg(last_5m): (
@ -24,6 +24,8 @@ resource "datadog_monitor" "API_Gateway_latency" {
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
silenced = "${var.latency_silenced}"
tags = ["env:${var.environment}", "resource:apigateway", "team:aws", "provider:aws"] tags = ["env:${var.environment}", "resource:apigateway", "team:aws", "provider:aws"]
} }
@ -31,7 +33,7 @@ resource "datadog_monitor" "API_Gateway_latency" {
resource "datadog_monitor" "API_http_5xx_errors_count" { resource "datadog_monitor" "API_http_5xx_errors_count" {
name = "[${var.environment}] API Gateway HTTP 5xx errors {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "[${var.environment}] API Gateway HTTP 5xx errors {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
type = "metric alert" type = "metric alert"
message = "${var.message}" message = "${coalesce(var.http_5xx_requests_message, var.message)}"
query = <<EOF query = <<EOF
sum(last_5m): ( sum(last_5m): (
@ -54,6 +56,8 @@ resource "datadog_monitor" "API_http_5xx_errors_count" {
timeout_h = 1 timeout_h = 1
include_tags = true include_tags = true
silenced = "${var.http_5xx_requests_silenced}"
tags = ["env:${var.environment}", "resource:apigateway", "team:aws", "provider:aws"] tags = ["env:${var.environment}", "resource:apigateway", "team:aws", "provider:aws"]
} }
@ -61,7 +65,7 @@ resource "datadog_monitor" "API_http_5xx_errors_count" {
resource "datadog_monitor" "API_http_4xx_errors_count" { resource "datadog_monitor" "API_http_4xx_errors_count" {
name = "[${var.environment}] API Gateway HTTP 4xx errors {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "[${var.environment}] API Gateway HTTP 4xx errors {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
type = "metric alert" type = "metric alert"
message = "${var.message}" message = "${coalesce(var.http_4xx_requests_message, var.message)}"
query = <<EOF query = <<EOF
sum(last_5m): ( sum(last_5m): (
@ -84,5 +88,7 @@ resource "datadog_monitor" "API_http_4xx_errors_count" {
timeout_h = 1 timeout_h = 1
include_tags = true include_tags = true
silenced = "${var.http_4xx_requests_silenced}"
tags = ["env:${var.environment}", "resource:apigateway", "team:aws", "provider:aws"] tags = ["env:${var.environment}", "resource:apigateway", "team:aws", "provider:aws"]
} }

View File

@ -29,11 +29,17 @@ Inputs
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| cpu_message | Custom message for ES cluster cpu monitor | string | `` | no |
| cpu_silenced | Groups to mute for ES cluster cpu monitor | map | `<map>` | no |
| cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | | cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no |
| cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no | | cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no |
| diskspace_message | Custom message for ES cluster diskspace monitor | string | `` | no |
| diskspace_silenced | Groups to mute for ES cluster diskspace monitor | map | `<map>` | no |
| diskspace_threshold_critical | Disk free space in percent (critical threshold) | string | `10` | no | | diskspace_threshold_critical | Disk free space in percent (critical threshold) | string | `10` | no |
| diskspace_threshold_warning | Disk free space in percent (warning threshold) | string | `20` | no | | diskspace_threshold_warning | Disk free space in percent (warning threshold) | string | `20` | no |
| environment | Architecture Environment | string | - | yes | | environment | Architecture Environment | string | - | yes |
| es_cluster_status_message | Custom message for ES cluster status monitor | string | `` | no |
| es_cluster_status_silenced | Groups to mute for ES cluster status monitor | map | `<map>` | no |
| es_cluster_volume_size | ElasticSearch Domain volume size (in GB) | string | - | yes | | es_cluster_volume_size | ElasticSearch Domain volume size (in GB) | string | - | yes |
| evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no | | evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |

View File

@ -25,10 +25,35 @@ variable "filter_tags_custom" {
} }
# AWS ElasticSearch Service specific # AWS ElasticSearch Service specific
variable "es_cluster_status_silenced" {
description = "Groups to mute for ES cluster status monitor"
type = "map"
default = {}
}
variable "es_cluster_status_message" {
description = "Custom message for ES cluster status monitor"
type = "string"
default = ""
}
variable "es_cluster_volume_size" { variable "es_cluster_volume_size" {
description = "ElasticSearch Domain volume size (in GB)" description = "ElasticSearch Domain volume size (in GB)"
} }
variable "diskspace_silenced" {
description = "Groups to mute for ES cluster diskspace monitor"
type = "map"
default = {}
}
variable "diskspace_message" {
description = "Custom message for ES cluster diskspace monitor"
type = "string"
default = ""
}
variable "diskspace_threshold_warning" { variable "diskspace_threshold_warning" {
description = "Disk free space in percent (warning threshold)" description = "Disk free space in percent (warning threshold)"
default = "20" default = "20"
@ -39,6 +64,18 @@ variable "diskspace_threshold_critical" {
default = "10" default = "10"
} }
variable "cpu_silenced" {
description = "Groups to mute for ES cluster cpu monitor"
type = "map"
default = {}
}
variable "cpu_message" {
description = "Custom message for ES cluster cpu monitor"
type = "string"
default = ""
}
variable "cpu_threshold_warning" { variable "cpu_threshold_warning" {
description = "CPU usage in percent (warning threshold)" description = "CPU usage in percent (warning threshold)"
default = "80" default = "80"

View File

@ -9,7 +9,7 @@ data "template_file" "filter" {
### Elasticsearch cluster status monitor ### ### Elasticsearch cluster status monitor ###
resource "datadog_monitor" "es_cluster_status" { resource "datadog_monitor" "es_cluster_status" {
name = "[${var.environment}] ElasticSearch cluster status is not green" name = "[${var.environment}] ElasticSearch cluster status is not green"
message = "${var.message}" message = "${coalesce(var.es_cluster_status_message, var.message)}"
type = "query alert" type = "query alert"
@ -37,13 +37,15 @@ EOF
new_host_delay = "${var.evaluation_delay}" new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20 no_data_timeframe = 20
silenced = "${var.es_cluster_status_silenced}"
tags = ["env:${var.environment}", "resource:elasticsearch", "team:aws", "provider:aws"] tags = ["env:${var.environment}", "resource:elasticsearch", "team:aws", "provider:aws"]
} }
### Elasticsearch cluster free storage space monitor ### ### Elasticsearch cluster free storage space monitor ###
resource "datadog_monitor" "es_free_space_low" { resource "datadog_monitor" "es_free_space_low" {
name = "[${var.environment}] ElasticSearch cluster free storage space {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "[${var.environment}] ElasticSearch cluster free storage space {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${var.message}" message = "${coalesce(var.diskspace_message, var.message)}"
type = "query alert" type = "query alert"
@ -70,13 +72,15 @@ EOF
new_host_delay = "${var.evaluation_delay}" new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20 no_data_timeframe = 20
silenced = "${var.diskspace_silenced}"
tags = ["env:${var.environment}", "resource:elasticsearch", "team:aws", "provider:aws"] tags = ["env:${var.environment}", "resource:elasticsearch", "team:aws", "provider:aws"]
} }
### Elasticsearch cluster CPU monitor ### ### Elasticsearch cluster CPU monitor ###
resource "datadog_monitor" "es_cpu_90_15min" { resource "datadog_monitor" "es_cpu_90_15min" {
name = "[${var.environment}] ElasticSearch cluster CPU high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "[${var.environment}] ElasticSearch cluster CPU high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${var.message}" message = "${coalesce(var.cpu_message, var.message)}"
type = "query alert" type = "query alert"
@ -102,5 +106,7 @@ EOF
new_host_delay = "${var.evaluation_delay}" new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20 no_data_timeframe = 20
silenced = "${var.cpu_silenced}"
tags = ["env:${var.environment}", "resource:elasticsearch", "team:aws", "provider:aws"] tags = ["env:${var.environment}", "resource:elasticsearch", "team:aws", "provider:aws"]
} }

View File

@ -31,16 +31,28 @@ Inputs
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| dd_aws_elb | # ELB | string | `disable` | no | | dd_aws_elb | # ELB | string | `disable` | no |
| elb_4xx_message | Custom message for ELB 4xx errors monitor | string | `` | no |
| elb_4xx_silenced | Groups to mute for ELB 4xx errors monitor | map | `<map>` | no |
| elb_4xx_threshold_critical | loadbalancer 4xx critical threshold in percentage | string | `10` | no | | elb_4xx_threshold_critical | loadbalancer 4xx critical threshold in percentage | string | `10` | no |
| elb_4xx_threshold_warning | loadbalancer 4xx warning threshold in percentage | string | `5` | no | | elb_4xx_threshold_warning | loadbalancer 4xx warning threshold in percentage | string | `5` | no |
| elb_5xx_message | Custom message for ELB 5xx errors monitor | string | `` | no |
| elb_5xx_silenced | Groups to mute for ELB 5xx errors monitor | map | `<map>` | no |
| elb_5xx_threshold_critical | loadbalancer 5xx critical threshold in percentage | string | `10` | no | | elb_5xx_threshold_critical | loadbalancer 5xx critical threshold in percentage | string | `10` | no |
| elb_5xx_threshold_warning | loadbalancer 5xx warning threshold in percentage | string | `5` | no | | elb_5xx_threshold_warning | loadbalancer 5xx warning threshold in percentage | string | `5` | no |
| elb_backend_4xx_message | Custom message for ELB backend 4xx errors monitor | string | `` | no |
| elb_backend_4xx_silenced | Groups to mute for ELB backend 4xx errors monitor | map | `<map>` | no |
| elb_backend_4xx_threshold_critical | loadbalancer backend 4xx critical threshold in percentage | string | `10` | no | | elb_backend_4xx_threshold_critical | loadbalancer backend 4xx critical threshold in percentage | string | `10` | no |
| elb_backend_4xx_threshold_warning | loadbalancer backend 4xx warning threshold in percentage | string | `5` | no | | elb_backend_4xx_threshold_warning | loadbalancer backend 4xx warning threshold in percentage | string | `5` | no |
| elb_backend_5xx_message | Custom message for ELB backend 5xx errors monitor | string | `` | no |
| elb_backend_5xx_silenced | Groups to mute for ELB backend 5xx errors monitor | map | `<map>` | no |
| elb_backend_5xx_threshold_critical | loadbalancer backend 5xx critical threshold in percentage | string | `10` | no | | elb_backend_5xx_threshold_critical | loadbalancer backend 5xx critical threshold in percentage | string | `10` | no |
| elb_backend_5xx_threshold_warning | loadbalancer backend 5xx warning threshold in percentage | string | `5` | no | | elb_backend_5xx_threshold_warning | loadbalancer backend 5xx warning threshold in percentage | string | `5` | no |
| elb_backend_latency_critical | latency critical threshold in seconds | string | `5` | no | | elb_backend_latency_critical | latency critical threshold in seconds | string | `5` | no |
| elb_backend_latency_message | Custom message for ELB backend latency monitor | string | `` | no |
| elb_backend_latency_silenced | Groups to mute for ELB backend latency monitor | map | `<map>` | no |
| elb_backend_latency_warning | latency warning threshold in seconds | string | `1` | no | | elb_backend_latency_warning | latency warning threshold in seconds | string | `1` | no |
| elb_no_healthy_instance_message | Custom message for ELB no healthy instance monitor | string | `` | no |
| elb_no_healthy_instance_silenced | Groups to mute for ELB no healthy instance monitor | map | `<map>` | no |
| environment | Architecture Environment | string | - | yes | | environment | Architecture Environment | string | - | yes |
| evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no | | evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |

View File

@ -29,6 +29,30 @@ variable "dd_aws_elb" {
default = "disable" default = "disable"
} }
# Mute configuration for the "ELB no healthy instances" monitor; the value is
# handed unchanged to that monitor's `silenced` argument. Defaults to an empty
# map.
variable "elb_no_healthy_instance_silenced" {
  description = "Groups to mute for ELB no healthy instance monitor"
  type        = "map"
  default     = {}
}
# Optional message override for the "ELB no healthy instances" monitor. The
# monitor uses coalesce(this, var.message), so the default empty string falls
# back to the module-wide `message`.
variable "elb_no_healthy_instance_message" {
  description = "Custom message for ELB no healthy instance monitor"
  type        = "string"
  default     = ""
}
variable "elb_4xx_silenced" {
description = "Groups to mute for ELB 4xx errors monitor"
type = "map"
default = {}
}
variable "elb_4xx_message" {
description = "Custom message for ELB 4xx errors monitor"
type = "string"
default = ""
}
variable "elb_4xx_threshold_warning" { variable "elb_4xx_threshold_warning" {
description = "loadbalancer 4xx warning threshold in percentage" description = "loadbalancer 4xx warning threshold in percentage"
default = 5 default = 5
@ -39,6 +63,18 @@ variable "elb_4xx_threshold_critical" {
default = 10 default = 10
} }
variable "elb_5xx_silenced" {
description = "Groups to mute for ELB 5xx errors monitor"
type = "map"
default = {}
}
variable "elb_5xx_message" {
description = "Custom message for ELB 5xx errors monitor"
type = "string"
default = ""
}
variable "elb_5xx_threshold_warning" { variable "elb_5xx_threshold_warning" {
description = "loadbalancer 5xx warning threshold in percentage" description = "loadbalancer 5xx warning threshold in percentage"
default = 5 default = 5
@ -49,6 +85,18 @@ variable "elb_5xx_threshold_critical" {
default = 10 default = 10
} }
variable "elb_backend_4xx_silenced" {
description = "Groups to mute for ELB backend 4xx errors monitor"
type = "map"
default = {}
}
variable "elb_backend_4xx_message" {
description = "Custom message for ELB backend 4xx errors monitor"
type = "string"
default = ""
}
variable "elb_backend_4xx_threshold_warning" { variable "elb_backend_4xx_threshold_warning" {
description = "loadbalancer backend 4xx warning threshold in percentage" description = "loadbalancer backend 4xx warning threshold in percentage"
default = 5 default = 5
@ -59,6 +107,18 @@ variable "elb_backend_4xx_threshold_critical" {
default = 10 default = 10
} }
variable "elb_backend_5xx_silenced" {
description = "Groups to mute for ELB backend 5xx errors monitor"
type = "map"
default = {}
}
variable "elb_backend_5xx_message" {
description = "Custom message for ELB backend 5xx errors monitor"
type = "string"
default = ""
}
variable "elb_backend_5xx_threshold_warning" { variable "elb_backend_5xx_threshold_warning" {
description = "loadbalancer backend 5xx warning threshold in percentage" description = "loadbalancer backend 5xx warning threshold in percentage"
default = 5 default = 5
@ -69,6 +129,18 @@ variable "elb_backend_5xx_threshold_critical" {
default = 10 default = 10
} }
variable "elb_backend_latency_silenced" {
description = "Groups to mute for ELB backend latency monitor"
type = "map"
default = {}
}
variable "elb_backend_latency_message" {
description = "Custom message for ELB backend latency monitor"
type = "string"
default = ""
}
variable "elb_backend_latency_warning" { variable "elb_backend_latency_warning" {
description = "latency warning threshold in seconds" description = "latency warning threshold in seconds"
default = 1 default = 1

View File

@ -8,7 +8,7 @@ data "template_file" "filter" {
resource "datadog_monitor" "ELB_no_healthy_instances" { resource "datadog_monitor" "ELB_no_healthy_instances" {
name = "[${var.environment}] ELB no healthy instances" name = "[${var.environment}] ELB no healthy instances"
message = "${var.message}" message = "${coalesce(var.elb_no_healthy_instance_message, var.message)}"
query = <<EOF query = <<EOF
avg(last_5m): ( avg(last_5m): (
@ -29,12 +29,14 @@ resource "datadog_monitor" "ELB_no_healthy_instances" {
new_host_delay = "${var.evaluation_delay}" new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20 no_data_timeframe = 20
silenced = "${var.elb_no_healthy_instance_silenced}"
tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"] tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"]
} }
resource "datadog_monitor" "ELB_too_much_4xx" { resource "datadog_monitor" "ELB_too_much_4xx" {
name = "[${var.environment}] ELB 4xx errors too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "[${var.environment}] ELB 4xx errors too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${var.message}" message = "${coalesce(var.elb_4xx_message, var.message)}"
query = <<EOF query = <<EOF
avg(last_5m): ( avg(last_5m): (
@ -61,12 +63,14 @@ resource "datadog_monitor" "ELB_too_much_4xx" {
new_host_delay = "${var.evaluation_delay}" new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20 no_data_timeframe = 20
silenced = "${var.elb_4xx_silenced}"
tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"] tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"]
} }
resource "datadog_monitor" "ELB_too_much_5xx" { resource "datadog_monitor" "ELB_too_much_5xx" {
name = "[${var.environment}] ELB 5xx errors too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "[${var.environment}] ELB 5xx errors too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${var.message}" message = "${coalesce(var.elb_5xx_message, var.message)}"
query = <<EOF query = <<EOF
avg(last_5m): ( avg(last_5m): (
@ -93,12 +97,14 @@ resource "datadog_monitor" "ELB_too_much_5xx" {
new_host_delay = "${var.evaluation_delay}" new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20 no_data_timeframe = 20
silenced = "${var.elb_5xx_silenced}"
tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"] tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"]
} }
resource "datadog_monitor" "ELB_too_much_4xx_backend" { resource "datadog_monitor" "ELB_too_much_4xx_backend" {
name = "[${var.environment}] ELB backend 4xx errors too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "[${var.environment}] ELB backend 4xx errors too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${var.message}" message = "${coalesce(var.elb_backend_4xx_message, var.message)}"
query = <<EOF query = <<EOF
avg(last_5m): ( avg(last_5m): (
@ -125,12 +131,14 @@ resource "datadog_monitor" "ELB_too_much_4xx_backend" {
new_host_delay = "${var.evaluation_delay}" new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20 no_data_timeframe = 20
silenced = "${var.elb_backend_4xx_silenced}"
tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"] tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"]
} }
resource "datadog_monitor" "ELB_too_much_5xx_backend" { resource "datadog_monitor" "ELB_too_much_5xx_backend" {
name = "[${var.environment}] ELB backend 5xx errors too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "[${var.environment}] ELB backend 5xx errors too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${var.message}" message = "${coalesce(var.elb_backend_5xx_message, var.message)}"
query = <<EOF query = <<EOF
avg(last_5m): ( avg(last_5m): (
@ -157,12 +165,14 @@ resource "datadog_monitor" "ELB_too_much_5xx_backend" {
new_host_delay = "${var.evaluation_delay}" new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20 no_data_timeframe = 20
silenced = "${var.elb_backend_5xx_silenced}"
tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"] tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"]
} }
resource "datadog_monitor" "ELB_backend_latency" { resource "datadog_monitor" "ELB_backend_latency" {
name = "[${var.environment}] ELB latency too high {{#is_alert}}{{comparator}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}s ({{value}}s){{/is_warning}}" name = "[${var.environment}] ELB latency too high {{#is_alert}}{{comparator}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"
message = "${var.message}" message = "${coalesce(var.elb_backend_latency_message, var.message)}"
query = <<EOF query = <<EOF
min(last_5m): ( min(last_5m): (
@ -188,5 +198,7 @@ resource "datadog_monitor" "ELB_backend_latency" {
new_host_delay = "${var.evaluation_delay}" new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20 no_data_timeframe = 20
silenced = "${var.elb_backend_latency_silenced}"
tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"] tags = ["env:${var.environment}", "resource:elb", "team:aws", "provider:aws"]
} }

View File

@ -28,6 +28,8 @@ Inputs
| environment | Environment | string | - | yes | | environment | Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| incoming_records_message | Custom message for Kinesis Firehose incoming records monitor | string | `` | no |
| incoming_records_silenced | Groups to mute for Kinesis Firehose incoming records monitor | map | `<map>` | no |
| incoming_records_timeframe | Monitor timeframe for incoming records metrics evaluation [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | | incoming_records_timeframe | Monitor timeframe for incoming records metrics evaluation [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no |
| message | Message sent when an alert is triggered | string | - | yes | | message | Message sent when an alert is triggered | string | - | yes |

View File

@ -25,6 +25,19 @@ variable "filter_tags_custom" {
} }
# Kinesis-Firehose # Kinesis-Firehose
# Mute configuration for the Kinesis Firehose "No incoming records" monitor;
# the value is handed unchanged to that monitor's `silenced` argument.
# Defaults to an empty map.
variable "incoming_records_silenced" {
  description = "Groups to mute for Kinesis Firehose incoming records monitor"
  type        = "map"
  default     = {}
}
# Optional message override for the Kinesis Firehose "No incoming records"
# monitor. The monitor uses coalesce(this, var.message), so the default empty
# string falls back to the module-wide `message`.
variable "incoming_records_message" {
  description = "Custom message for Kinesis Firehose incoming records monitor"
  type        = "string"
  default     = ""
}
variable "incoming_records_timeframe" { variable "incoming_records_timeframe" {
description = "Monitor timeframe for incoming records metrics evaluation [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for incoming records metrics evaluation [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
default = "last_15m" default = "last_15m"

View File

@ -9,7 +9,7 @@ data "template_file" "filter" {
### Kinesis Firehose Incoming records ### ### Kinesis Firehose Incoming records ###
resource "datadog_monitor" "firehose_incoming_records" { resource "datadog_monitor" "firehose_incoming_records" {
name = "[${var.environment}] Kinesis Firehose No incoming records" name = "[${var.environment}] Kinesis Firehose No incoming records"
message = "${var.message}" message = "${coalesce(var.incoming_records_message, var.message)}"
type = "metric alert" type = "metric alert"
@ -34,5 +34,7 @@ EOF
new_host_delay = "${var.delay}" new_host_delay = "${var.delay}"
no_data_timeframe = 20 no_data_timeframe = 20
silenced = "${var.incoming_records_silenced}"
tags = ["env:${var.environment}", "resource:kinesis-firehose", "team:aws", "provider:aws"] tags = ["env:${var.environment}", "resource:kinesis-firehose", "team:aws", "provider:aws"]
} }

View File

@ -25,8 +25,12 @@ Inputs
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| cpu_message | Custom message for RDS CPU usage monitor | string | `` | no |
| cpu_silenced | Groups to mute for RDS CPU usage monitor | map | `<map>` | no |
| cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | | cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no |
| cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no | | cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no |
| diskspace_message | Custom message for RDS free diskspace monitor | string | `` | no |
| diskspace_silenced | Groups to mute for RDS free diskspace monitor | map | `<map>` | no |
| diskspace_threshold_critical | Disk free space in percent (critical threshold) | string | `10` | no | | diskspace_threshold_critical | Disk free space in percent (critical threshold) | string | `10` | no |
| diskspace_threshold_warning | Disk free space in percent (warning threshold) | string | `20` | no | | diskspace_threshold_warning | Disk free space in percent (warning threshold) | string | `20` | no |
| environment | Architecture Environment | string | - | yes | | environment | Architecture Environment | string | - | yes |

View File

@ -26,6 +26,18 @@ variable "filter_tags_custom" {
# AWS RDS instance specific # AWS RDS instance specific
variable "cpu_silenced" {
description = "Groups to mute for RDS CPU usage monitor"
type = "map"
default = {}
}
variable "cpu_message" {
description = "Custom message for RDS CPU usage monitor"
type = "string"
default = ""
}
variable "cpu_threshold_warning" { variable "cpu_threshold_warning" {
description = "CPU usage in percent (warning threshold)" description = "CPU usage in percent (warning threshold)"
default = "80" default = "80"
@ -36,6 +48,18 @@ variable "cpu_threshold_critical" {
default = "90" default = "90"
} }
variable "diskspace_silenced" {
description = "Groups to mute for RDS free diskspace monitor"
type = "map"
default = {}
}
variable "diskspace_message" {
description = "Custom message for RDS free diskspace monitor"
type = "string"
default = ""
}
variable "diskspace_threshold_warning" { variable "diskspace_threshold_warning" {
description = "Disk free space in percent (warning threshold)" description = "Disk free space in percent (warning threshold)"
default = "20" default = "20"

View File

@ -9,7 +9,7 @@ data "template_file" "filter" {
### RDS instance CPU monitor ### ### RDS instance CPU monitor ###
resource "datadog_monitor" "rds_cpu_90_15min" { resource "datadog_monitor" "rds_cpu_90_15min" {
name = "[${var.environment}] RDS instance CPU high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "[${var.environment}] RDS instance CPU high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${var.message}" message = "${coalesce(var.cpu_message, var.message)}"
type = "metric alert" type = "metric alert"
@ -34,13 +34,15 @@ EOF
new_host_delay = "${var.evaluation_delay}" new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20 no_data_timeframe = 20
silenced = "${var.cpu_silenced}"
tags = ["env:${var.environment}", "resource:rds", "team:aws", "provider:aws"] tags = ["env:${var.environment}", "resource:rds", "team:aws", "provider:aws"]
} }
### RDS instance free space monitor ### ### RDS instance free space monitor ###
resource "datadog_monitor" "rds_free_space_low" { resource "datadog_monitor" "rds_free_space_low" {
name = "[${var.environment}] RDS instance free space {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "[${var.environment}] RDS instance free space {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${var.message}" message = "${coalesce(var.diskspace_message, var.message)}"
type = "metric alert" type = "metric alert"
@ -66,5 +68,7 @@ EOF
new_host_delay = "${var.evaluation_delay}" new_host_delay = "${var.evaluation_delay}"
no_data_timeframe = 20 no_data_timeframe = 20
silenced = "${var.diskspace_silenced}"
tags = ["env:${var.environment}", "resource:rds", "team:aws", "provider:aws"] tags = ["env:${var.environment}", "resource:rds", "team:aws", "provider:aws"]
} }

View File

@ -29,3 +29,5 @@ Inputs
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | | filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | | filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| message | Message sent when an alert is triggered | string | - | yes | | message | Message sent when an alert is triggered | string | - | yes |
| vpn_status_message | Custom message for VPN status monitor | string | `` | no |
| vpn_status_silenced | Groups to mute for VPN status monitor | map | `<map>` | no |

View File

@ -23,3 +23,15 @@ variable "filter_tags_custom" {
description = "Tags used for custom filtering when filter_tags_use_defaults is false" description = "Tags used for custom filtering when filter_tags_use_defaults is false"
default = "*" default = "*"
} }
variable "vpn_status_silenced" {
description = "Groups to mute for VPN status monitor"
type = "map"
default = {}
}
variable "vpn_status_message" {
description = "Custom message for VPN status monitor"
type = "string"
default = ""
}

View File

@ -8,7 +8,7 @@ data "template_file" "filter" {
resource "datadog_monitor" "VPN_status" { resource "datadog_monitor" "VPN_status" {
name = "[${var.environment}] VPN Down" name = "[${var.environment}] VPN Down"
message = "${var.message}" message = "${coalesce(var.vpn_status_message, var.message)}"
query = <<EOF query = <<EOF
avg(last_5m): ( avg(last_5m): (
@ -27,5 +27,7 @@ resource "datadog_monitor" "VPN_status" {
include_tags = true include_tags = true
require_full_window = false require_full_window = false
silenced = "${var.vpn_status_silenced}"
tags = ["env: ${var.environment}", "resource:vpn", "team:aws", "provider:aws"] tags = ["env: ${var.environment}", "resource:vpn", "team:aws", "provider:aws"]
} }