Merged in MON-96_best_patrices_updating (pull request #75)
MON-96 best practices updating Approved-by: Quentin Manfroi <quentin.manfroi@yahoo.fr> Approved-by: Alexandre Gaillet <alexandre.gaillet@fr.clara.net> Approved-by: Boris Rousseau <boris.rousseau@morea.fr> Approved-by: Jérôme Respaut <shr3ps@gmail.com>
This commit is contained in:
commit
e82438ec91
@ -32,7 +32,7 @@ Inputs
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| alb_no_healthy_instances_message | Custom message for ALB no healthy instances monitor | string | `` | no |
|
||||
| alb_no_healthy_instances_silenced | Groups to mute for ALB no healthy instances monitor | map | `<map>` | no |
|
||||
| artificial_requests_count | Number of false requests used to mitigate false positives in case of low traffic | string | `0` | no |
|
||||
| artificial_requests_count | Number of false requests used to mitigate false positives in case of low traffic | string | `5` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||
| environment | Architecture environment | string | - | yes |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
|
||||
@ -15,7 +15,7 @@ resource "datadog_monitor" "ALB_no_healthy_instances" {
|
||||
|
||||
query = <<EOF
|
||||
min(last_1m): (
|
||||
sum:aws.applicationelb.healthy_host_count{${data.template_file.filter.rendered}} by {region,loadbalancer}
|
||||
min:aws.applicationelb.healthy_host_count{${data.template_file.filter.rendered}} by {region,loadbalancer}
|
||||
) <= 0
|
||||
EOF
|
||||
|
||||
@ -38,13 +38,13 @@ resource "datadog_monitor" "ALB_no_healthy_instances" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "ALB_latency" {
|
||||
name = "[${var.environment}] ALB latency {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] ALB latency {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"
|
||||
type = "metric alert"
|
||||
message = "${coalesce(var.latency_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
min(last_5m): (
|
||||
avg:aws.applicationelb.target_response_time.average{${data.template_file.filter.rendered}} by {region,loadbalancer}
|
||||
min:aws.applicationelb.target_response_time.average{${data.template_file.filter.rendered}} by {region,loadbalancer}
|
||||
) > ${var.latency_threshold_critical}
|
||||
EOF
|
||||
|
||||
@ -68,15 +68,15 @@ resource "datadog_monitor" "ALB_latency" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "ALB_httpcode_elb_5xx" {
|
||||
name = "[${var.environment}] ALB HTTP code 5xx {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] ALB HTTP code 5xx {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
type = "metric alert"
|
||||
message = "${coalesce(var.httpcode_elb_5xx_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
sum(last_5m): (
|
||||
min(last_5m): (
|
||||
default(
|
||||
avg:aws.applicationelb.httpcode_elb_5xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
|
||||
(avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
|
||||
min:aws.applicationelb.httpcode_elb_5xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
|
||||
(min:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
|
||||
0) * 100
|
||||
) > ${var.httpcode_elb_5xx_threshold_critical}
|
||||
EOF
|
||||
@ -101,15 +101,15 @@ resource "datadog_monitor" "ALB_httpcode_elb_5xx" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "ALB_httpcode_elb_4xx" {
|
||||
name = "[${var.environment}] ALB HTTP code 4xx {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] ALB HTTP code 4xx {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
type = "metric alert"
|
||||
message = "${coalesce(var.httpcode_elb_4xx_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
sum(last_5m): (
|
||||
min(last_5m): (
|
||||
default(
|
||||
avg:aws.applicationelb.httpcode_elb_4xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
|
||||
(avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
|
||||
min:aws.applicationelb.httpcode_elb_4xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
|
||||
(min:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
|
||||
0) * 100
|
||||
) > ${var.httpcode_elb_4xx_threshold_critical}
|
||||
EOF
|
||||
@ -134,15 +134,15 @@ resource "datadog_monitor" "ALB_httpcode_elb_4xx" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "ALB_httpcode_target_5xx" {
|
||||
name = "[${var.environment}] ALB target HTTP code 5xx {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] ALB target HTTP code 5xx {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
type = "metric alert"
|
||||
message = "${coalesce(var.httpcode_target_5xx_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
sum(last_5m): (
|
||||
min(last_5m): (
|
||||
default(
|
||||
avg:aws.applicationelb.httpcode_target_5xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
|
||||
(avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
|
||||
min:aws.applicationelb.httpcode_target_5xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
|
||||
(min:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
|
||||
0) * 100
|
||||
) > ${var.httpcode_target_5xx_threshold_critical}
|
||||
EOF
|
||||
@ -167,15 +167,15 @@ resource "datadog_monitor" "ALB_httpcode_target_5xx" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "ALB_httpcode_target_4xx" {
|
||||
name = "[${var.environment}] ALB target HTTP code 4xx {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] ALB target HTTP code 4xx {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
type = "metric alert"
|
||||
message = "${coalesce(var.httpcode_target_4xx_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
sum(last_5m): (
|
||||
min(last_5m): (
|
||||
default(
|
||||
avg:aws.applicationelb.httpcode_target_4xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
|
||||
(avg:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
|
||||
min:aws.applicationelb.httpcode_target_4xx{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() /
|
||||
(min:aws.applicationelb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancer}.as_count() + ${var.artificial_requests_count}),
|
||||
0) * 100
|
||||
) > ${var.httpcode_target_4xx_threshold_critical}
|
||||
EOF
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
# Monitoring Api Gateway latency
|
||||
resource "datadog_monitor" "API_Gateway_latency" {
|
||||
name = "[${var.environment}] API Gateway latency {{#is_alert}}{{comparator}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
|
||||
name = "[${var.environment}] API Gateway latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
|
||||
type = "metric alert"
|
||||
message = "${coalesce(var.latency_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
avg(last_5m): (
|
||||
avg:aws.apigateway.latency{${var.filter_tags}} by {region,apiname}
|
||||
min(last_5m): (
|
||||
min:aws.apigateway.latency{${var.filter_tags}} by {region,apiname}
|
||||
) > ${var.latency_threshold_critical}
|
||||
EOF
|
||||
|
||||
@ -31,15 +31,15 @@ resource "datadog_monitor" "API_Gateway_latency" {
|
||||
|
||||
# Monitoring API Gateway 5xx errors percent
|
||||
resource "datadog_monitor" "API_http_5xx_errors_count" {
|
||||
name = "[${var.environment}] API Gateway HTTP 5xx errors {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] API Gateway HTTP 5xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
type = "metric alert"
|
||||
message = "${coalesce(var.http_5xx_requests_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
sum(last_5m): (
|
||||
min(last_5m): (
|
||||
default(
|
||||
avg:aws.apigateway.5xxerror{${var.filter_tags}} by {region,apiname}.as_count() /
|
||||
(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname}.as_count() + ${var.artificial_requests_count}),
|
||||
min:aws.apigateway.5xxerror{${var.filter_tags}} by {region,apiname}.as_count() /
|
||||
(min:aws.apigateway.count{${var.filter_tags}} by {region,apiname}.as_count() + ${var.artificial_requests_count}),
|
||||
0) * 100
|
||||
) > ${var.http_5xx_requests_threshold_critical}
|
||||
EOF
|
||||
@ -65,15 +65,15 @@ resource "datadog_monitor" "API_http_5xx_errors_count" {
|
||||
|
||||
# Monitoring API Gateway 4xx errors percent
|
||||
resource "datadog_monitor" "API_http_4xx_errors_count" {
|
||||
name = "[${var.environment}] API Gateway HTTP 4xx errors {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] API Gateway HTTP 4xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
type = "metric alert"
|
||||
message = "${coalesce(var.http_4xx_requests_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
sum(last_5m): (
|
||||
min(last_5m): (
|
||||
default(
|
||||
avg:aws.apigateway.4xxerror{${var.filter_tags}} by {region,apiname}.as_count() /
|
||||
(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname}.as_count() + ${var.artificial_requests_count}),
|
||||
min:aws.apigateway.4xxerror{${var.filter_tags}} by {region,apiname}.as_count() /
|
||||
(min:aws.apigateway.count{${var.filter_tags}} by {region,apiname}.as_count() + ${var.artificial_requests_count}),
|
||||
0) * 100
|
||||
) > ${var.http_4xx_requests_threshold_critical}
|
||||
EOF
|
||||
|
||||
@ -33,7 +33,7 @@ Inputs
|
||||
| cpu_silenced | Groups to mute for ES cluster cpu monitor | map | `<map>` | no |
|
||||
| cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no |
|
||||
| cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||
| diskspace_message | Custom message for ES cluster diskspace monitor | string | `` | no |
|
||||
| diskspace_silenced | Groups to mute for ES cluster diskspace monitor | map | `<map>` | no |
|
||||
| diskspace_threshold_critical | Disk free space in percent (critical threshold) | string | `10` | no |
|
||||
|
||||
@ -7,7 +7,7 @@ variable "environment" {
|
||||
# Global DataDog
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
default = 900
|
||||
}
|
||||
|
||||
variable "message" {
|
||||
|
||||
@ -47,7 +47,7 @@ EOF
|
||||
|
||||
### Elasticsearch cluster free storage space monitor ###
|
||||
resource "datadog_monitor" "es_free_space_low" {
|
||||
name = "[${var.environment}] ElasticSearch cluster free storage space {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] ElasticSearch cluster free storage space {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.diskspace_message, var.message)}"
|
||||
|
||||
type = "metric alert"
|
||||
@ -82,7 +82,7 @@ EOF
|
||||
|
||||
### Elasticsearch cluster CPU monitor ###
|
||||
resource "datadog_monitor" "es_cpu_90_15min" {
|
||||
name = "[${var.environment}] ElasticSearch cluster CPU high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] ElasticSearch cluster CPU high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.cpu_message, var.message)}"
|
||||
|
||||
type = "metric alert"
|
||||
|
||||
@ -31,6 +31,7 @@ Inputs
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| dd_aws_elb | # ELB | string | `disable` | no |
|
||||
| artificial_requests_count | Number of false requests used to mitigate false positives in case of low traffic | string | `5` | no |
|
||||
| elb_4xx_message | Custom message for ELB 4xx errors monitor | string | `` | no |
|
||||
| elb_4xx_silenced | Groups to mute for ELB 4xx errors monitor | map | `<map>` | no |
|
||||
| elb_4xx_threshold_critical | loadbalancer 4xx critical threshold in percentage | string | `10` | no |
|
||||
@ -54,7 +55,7 @@ Inputs
|
||||
| elb_no_healthy_instance_message | Custom message for ELB no healthy instance monitor | string | `` | no |
|
||||
| elb_no_healthy_instance_silenced | Groups to mute for ELB no healthy instance monitor | map | `<map>` | no |
|
||||
| environment | Architecture Environment | string | - | yes |
|
||||
| evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| message | Message sent when an alert is triggered | string | - | yes |
|
||||
| message | Message sent when an alert is triggered | string | - | yes |
|
||||
|
||||
@ -5,9 +5,9 @@ variable "environment" {
|
||||
}
|
||||
|
||||
# Global DataDog
|
||||
variable "evaluation_delay" {
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
default = 900
|
||||
}
|
||||
|
||||
variable "message" {
|
||||
@ -150,3 +150,8 @@ variable "elb_backend_latency_critical" {
|
||||
description = "latency critical threshold in seconds"
|
||||
default = 5
|
||||
}
|
||||
|
||||
variable "artificial_requests_count" {
|
||||
default = 5
|
||||
description = "Number of false requests used to mitigate false positive in case of low trafic"
|
||||
}
|
||||
|
||||
@ -11,22 +11,22 @@ resource "datadog_monitor" "ELB_no_healthy_instances" {
|
||||
message = "${coalesce(var.elb_no_healthy_instance_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
avg(last_5m): (
|
||||
avg:aws.elb.healthy_host_count{${data.template_file.filter.rendered}} by {region,loadbalancername}
|
||||
min(last_5m): (
|
||||
min:aws.elb.healthy_host_count{${data.template_file.filter.rendered}} by {region,loadbalancername}
|
||||
) < 1
|
||||
EOF
|
||||
|
||||
type = "metric alert"
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
silenced = "${var.elb_no_healthy_instance_silenced}"
|
||||
@ -35,14 +35,14 @@ resource "datadog_monitor" "ELB_no_healthy_instances" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "ELB_too_much_4xx" {
|
||||
name = "[${var.environment}] ELB 4xx errors too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] ELB 4xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.elb_4xx_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
avg(last_5m): (
|
||||
min(last_5m): (
|
||||
default(
|
||||
avg:aws.elb.httpcode_elb_4xx{${data.template_file.filter.rendered}} by {region,loadbalancername} /
|
||||
avg:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername},
|
||||
min:aws.elb.httpcode_elb_4xx{${data.template_file.filter.rendered}} by {region,loadbalancername}.as_count() /
|
||||
(min:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername}.as_count() + ${var.artificial_requests_count}),
|
||||
0) * 100
|
||||
) > ${var.elb_4xx_threshold_critical}
|
||||
EOF
|
||||
@ -55,14 +55,14 @@ resource "datadog_monitor" "ELB_too_much_4xx" {
|
||||
}
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
silenced = "${var.elb_4xx_silenced}"
|
||||
@ -71,14 +71,14 @@ resource "datadog_monitor" "ELB_too_much_4xx" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "ELB_too_much_5xx" {
|
||||
name = "[${var.environment}] ELB 5xx errors too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] ELB 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.elb_5xx_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
avg(last_5m): (
|
||||
min(last_5m): (
|
||||
default(
|
||||
avg:aws.elb.httpcode_elb_5xx{${data.template_file.filter.rendered}} by {region,loadbalancername} /
|
||||
avg:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername},
|
||||
min:aws.elb.httpcode_elb_5xx{${data.template_file.filter.rendered}} by {region,loadbalancername} /
|
||||
(min:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername} + ${var.artificial_requests_count}),
|
||||
0) * 100
|
||||
) > ${var.elb_5xx_threshold_critical}
|
||||
EOF
|
||||
@ -91,14 +91,14 @@ resource "datadog_monitor" "ELB_too_much_5xx" {
|
||||
}
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
silenced = "${var.elb_5xx_silenced}"
|
||||
@ -107,14 +107,14 @@ resource "datadog_monitor" "ELB_too_much_5xx" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "ELB_too_much_4xx_backend" {
|
||||
name = "[${var.environment}] ELB backend 4xx errors too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] ELB backend 4xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.elb_backend_4xx_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
avg(last_5m): (
|
||||
min(last_5m): (
|
||||
default(
|
||||
avg:aws.elb.httpcode_backend_4xx{${data.template_file.filter.rendered}} by {region,loadbalancername} /
|
||||
avg:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername},
|
||||
min:aws.elb.httpcode_backend_4xx{${data.template_file.filter.rendered}} by {region,loadbalancername} /
|
||||
(min:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername} + ${var.artificial_requests_count}),
|
||||
0) * 100
|
||||
) > ${var.elb_backend_4xx_threshold_critical}
|
||||
EOF
|
||||
@ -127,14 +127,14 @@ resource "datadog_monitor" "ELB_too_much_4xx_backend" {
|
||||
}
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
silenced = "${var.elb_backend_4xx_silenced}"
|
||||
@ -143,14 +143,14 @@ resource "datadog_monitor" "ELB_too_much_4xx_backend" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "ELB_too_much_5xx_backend" {
|
||||
name = "[${var.environment}] ELB backend 5xx errors too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] ELB backend 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.elb_backend_5xx_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
avg(last_5m): (
|
||||
min(last_5m): (
|
||||
default(
|
||||
avg:aws.elb.httpcode_backend_5xx{${data.template_file.filter.rendered}} by {region,loadbalancername} /
|
||||
avg:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername},
|
||||
min:aws.elb.httpcode_backend_5xx{${data.template_file.filter.rendered}} by {region,loadbalancername} /
|
||||
(min:aws.elb.request_count{${data.template_file.filter.rendered}} by {region,loadbalancername} + ${var.artificial_requests_count}),
|
||||
0) * 100
|
||||
) > ${var.elb_backend_5xx_threshold_critical}
|
||||
EOF
|
||||
@ -163,14 +163,14 @@ resource "datadog_monitor" "ELB_too_much_5xx_backend" {
|
||||
}
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
silenced = "${var.elb_backend_5xx_silenced}"
|
||||
@ -179,12 +179,12 @@ resource "datadog_monitor" "ELB_too_much_5xx_backend" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "ELB_backend_latency" {
|
||||
name = "[${var.environment}] ELB latency too high {{#is_alert}}{{comparator}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"
|
||||
name = "[${var.environment}] ELB latency too high {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"
|
||||
message = "${coalesce(var.elb_backend_latency_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
min(last_5m): (
|
||||
avg:aws.elb.latency{${data.template_file.filter.rendered}} by {region,loadbalancername}
|
||||
min:aws.elb.latency{${data.template_file.filter.rendered}} by {region,loadbalancername}
|
||||
) > ${var.elb_backend_latency_critical}
|
||||
EOF
|
||||
|
||||
@ -196,14 +196,14 @@ resource "datadog_monitor" "ELB_backend_latency" {
|
||||
}
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
silenced = "${var.elb_backend_latency_silenced}"
|
||||
|
||||
@ -34,7 +34,7 @@ Inputs
|
||||
| diskspace_threshold_critical | Disk free space in percent (critical threshold) | string | `10` | no |
|
||||
| diskspace_threshold_warning | Disk free space in percent (warning threshold) | string | `20` | no |
|
||||
| environment | Architecture Environment | string | - | yes |
|
||||
| evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| message | Message sent when an alert is triggered | string | - | yes |
|
||||
|
||||
@ -5,9 +5,9 @@ variable "environment" {
|
||||
}
|
||||
|
||||
# Global DataDog
|
||||
variable "evaluation_delay" {
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
default = 900
|
||||
}
|
||||
|
||||
variable "message" {
|
||||
|
||||
@ -8,7 +8,7 @@ data "template_file" "filter" {
|
||||
|
||||
### RDS instance CPU monitor ###
|
||||
resource "datadog_monitor" "rds_cpu_90_15min" {
|
||||
name = "[${var.environment}] RDS instance CPU high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] RDS instance CPU high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.cpu_message, var.message)}"
|
||||
|
||||
type = "metric alert"
|
||||
@ -25,13 +25,13 @@ EOF
|
||||
}
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
silenced = "${var.cpu_silenced}"
|
||||
@ -41,7 +41,7 @@ EOF
|
||||
|
||||
### RDS instance free space monitor ###
|
||||
resource "datadog_monitor" "rds_free_space_low" {
|
||||
name = "[${var.environment}] RDS instance free space {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] RDS instance free space {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.diskspace_message, var.message)}"
|
||||
|
||||
type = "metric alert"
|
||||
@ -59,13 +59,13 @@ EOF
|
||||
}
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
silenced = "${var.diskspace_silenced}"
|
||||
|
||||
@ -25,7 +25,7 @@ Inputs
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| environment | Architecture Environment | string | - | yes |
|
||||
| evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| message | Message sent when an alert is triggered | string | - | yes |
|
||||
|
||||
@ -5,9 +5,9 @@ variable "environment" {
|
||||
}
|
||||
|
||||
# Global DataDog
|
||||
variable "evaluation_delay" {
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
default = 900
|
||||
}
|
||||
|
||||
variable "message" {
|
||||
|
||||
@ -20,8 +20,8 @@ resource "datadog_monitor" "VPN_status" {
|
||||
|
||||
notify_no_data = true
|
||||
renotify_interval = 0
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
|
||||
@ -69,7 +69,7 @@ Inputs
|
||||
| appservices_response_time_silenced | Groups to mute for App Services response time monitor | map | `<map>` | no |
|
||||
| appservices_response_time_threshold_critical | Alerting threshold for response time in seconds | string | `10` | no |
|
||||
| appservices_response_time_threshold_warning | Warning threshold for response time in seconds | string | `5` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||
| environment | Architecture environment | string | - | yes |
|
||||
| eventhub_errors_rate_message | Custom message for Event Hub errors monitor | string | `` | no |
|
||||
| eventhub_errors_rate_silenced | Groups to mute for Event Hub errors monitor | map | `<map>` | no |
|
||||
|
||||
@ -27,7 +27,7 @@ Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||
| environment | Architecture environment | string | - | yes |
|
||||
| failed_requests_message | Custom message for API Management failed requests monitor | string | `` | no |
|
||||
| failed_requests_silenced | Groups to mute for API Management failed requests monitor | map | `<map>` | no |
|
||||
|
||||
@ -11,7 +11,7 @@ variable "message" {
|
||||
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
default = 900
|
||||
}
|
||||
|
||||
variable "filter_tags_use_defaults" {
|
||||
|
||||
@ -39,7 +39,7 @@ resource "datadog_monitor" "apimgt_status" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "apimgt_failed_requests" {
|
||||
name = "[${var.environment}] API Management too many failed requests {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] API Management too many failed requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.failed_requests_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -72,7 +72,7 @@ resource "datadog_monitor" "apimgt_failed_requests" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "apimgt_other_requests" {
|
||||
name = "[${var.environment}] API Management too many other requests {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] API Management too many other requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.other_requests_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -105,7 +105,7 @@ resource "datadog_monitor" "apimgt_other_requests" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "apimgt_unauthorized_requests" {
|
||||
name = "[${var.environment}] API Management too many unauthorized requests {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] API Management too many unauthorized requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.unauthorized_requests_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -138,7 +138,7 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "apimgt_successful_requests" {
|
||||
name = "[${var.environment}] API Management successful requests rate too low {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] API Management successful requests rate too low {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.successful_requests_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
|
||||
@ -28,7 +28,7 @@ Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||
| environment | Architecture environment | string | - | yes |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
|
||||
@ -19,7 +19,7 @@ variable "message" {
|
||||
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
default = 900
|
||||
}
|
||||
|
||||
# Azure App Services specific variables
|
||||
|
||||
@ -8,7 +8,7 @@ data "template_file" "filter" {
|
||||
|
||||
# Monitoring App Services response time
|
||||
resource "datadog_monitor" "appservices_response_time" {
|
||||
name = "[${var.environment}] App Services response time too high {{#is_alert}}{{comparator}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"
|
||||
name = "[${var.environment}] App Services response time too high {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"
|
||||
type = "metric alert"
|
||||
message = "${coalesce(var.response_time_message, var.message)}"
|
||||
|
||||
@ -39,7 +39,7 @@ resource "datadog_monitor" "appservices_response_time" {
|
||||
|
||||
# Monitoring App Services memory usage
|
||||
resource "datadog_monitor" "appservices_memory_usage_count" {
|
||||
name = "[${var.environment}] App Services memory usage {{#is_alert}}{{comparator}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
name = "[${var.environment}] App Services memory usage {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
type = "metric alert"
|
||||
message = "${coalesce(var.memory_usage_message, var.message)}"
|
||||
|
||||
@ -70,7 +70,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" {
|
||||
|
||||
# Monitoring App Services 5xx errors percent
|
||||
resource "datadog_monitor" "appservices_http_5xx_errors_count" {
|
||||
name = "[${var.environment}] App Services HTTP 5xx errors too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] App Services HTTP 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
type = "metric alert"
|
||||
message = "${coalesce(var.http_5xx_requests_message, var.message)}"
|
||||
|
||||
@ -102,7 +102,7 @@ resource "datadog_monitor" "appservices_http_5xx_errors_count" {
|
||||
|
||||
# Monitoring App Services 4xx errors percent
|
||||
resource "datadog_monitor" "appservices_http_4xx_errors_count" {
|
||||
name = "[${var.environment}] App Services HTTP 4xx errors too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] App Services HTTP 4xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
type = "metric alert"
|
||||
message = "${coalesce(var.http_4xx_requests_message, var.message)}"
|
||||
|
||||
@ -134,7 +134,7 @@ resource "datadog_monitor" "appservices_http_4xx_errors_count" {
|
||||
|
||||
# Monitoring App Services HTTP 2xx & 3xx status pages percent
|
||||
resource "datadog_monitor" "appservices_http_success_status_rate" {
|
||||
name = "[${var.environment}] App Services HTTP successful responses too low {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] App Services HTTP successful responses too low {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
type = "metric alert"
|
||||
message = "${coalesce(var.http_successful_requests_message, var.message)}"
|
||||
|
||||
|
||||
@ -27,7 +27,7 @@ Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||
| environment | Architecture environment | string | - | yes |
|
||||
| errors_rate_message | Custom message for Event Hub errors monitor | string | `` | no |
|
||||
| errors_rate_silenced | Groups to mute for Event Hub errors monitor | map | `<map>` | no |
|
||||
|
||||
@ -11,7 +11,7 @@ variable "message" {
|
||||
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
default = 900
|
||||
}
|
||||
|
||||
variable "filter_tags_use_defaults" {
|
||||
|
||||
@ -33,7 +33,7 @@ resource "datadog_monitor" "eventhub_status" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "eventhub_failed_requests" {
|
||||
name = "[${var.environment}] Event Hub too many failed requests {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Event Hub too many failed requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.failed_requests_rate_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -69,7 +69,7 @@ resource "datadog_monitor" "eventhub_failed_requests" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "eventhub_errors" {
|
||||
name = "[${var.environment}] Event Hub too many errors {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Event Hub too many errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.errors_rate_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
|
||||
@ -10,7 +10,7 @@ variable "message" {
|
||||
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
default = 900
|
||||
}
|
||||
|
||||
variable "filter_tags_use_defaults" {
|
||||
|
||||
@ -38,7 +38,7 @@ Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||
| dropped_d2c_telemetry_egress_message | Custom message for IoT Hub dropped d2c telemetry monitor | string | `` | no |
|
||||
| dropped_d2c_telemetry_egress_rate_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `90` | no |
|
||||
| dropped_d2c_telemetry_egress_rate_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `50` | no |
|
||||
|
||||
@ -7,7 +7,7 @@ variable "environment" {
|
||||
# Global DataDog
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
default = 900
|
||||
}
|
||||
|
||||
variable "message" {
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
resource "datadog_monitor" "too_many_jobs_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many jobs failed {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] IOT Hub Too many jobs failed {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.failed_jobs_rate_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -34,7 +34,7 @@ resource "datadog_monitor" "too_many_jobs_failed" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_list_jobs_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many list_jobs failure {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] IOT Hub Too many list_jobs failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.failed_listjobs_rate_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -69,7 +69,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_query_jobs_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many query_jobs failed {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] IOT Hub Too many query_jobs failed {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.failed_queryjobs_rate_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -130,7 +130,7 @@ resource "datadog_monitor" "status" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "total_devices" {
|
||||
name = "[${var.environment}] IOT Hub Total devices is wrong {{#is_alert}}{{comparator}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
name = "[${var.environment}] IOT Hub Total devices is wrong {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
message = "${coalesce(var.total_devices_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -156,7 +156,7 @@ resource "datadog_monitor" "total_devices" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_c2d_methods_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many c2d methods failure {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] IOT Hub Too many c2d methods failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.failed_c2d_methods_rate_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -191,7 +191,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many c2d twin read failure {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] IOT Hub Too many c2d twin read failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.failed_c2d_twin_read_rate_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -226,7 +226,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many c2d twin update failure {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] IOT Hub Too many c2d twin update failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.failed_c2d_twin_update_rate_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -261,7 +261,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_twin_read_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c twin read failure {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] IOT Hub Too many d2c twin read failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.failed_d2c_twin_read_rate_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -296,7 +296,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_twin_update_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c twin update failure {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] IOT Hub Too many d2c twin update failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.failed_d2c_twin_update_rate_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -331,7 +331,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped {{#is_alert}}{{comparator}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
message = "${coalesce(var.dropped_d2c_telemetry_egress_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -369,7 +369,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned {{#is_alert}}{{comparator}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
message = "${coalesce(var.orphaned_d2c_telemetry_egress_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -407,7 +407,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid {{#is_alert}}{{comparator}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
message = "${coalesce(var.invalid_d2c_telemetry_egress_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -445,7 +445,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress not sent {{#is_alert}}{{comparator}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress not sent {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
message = "${coalesce(var.too_many_d2c_telemetry_ingress_nosent_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
|
||||
@ -27,7 +27,7 @@ Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||
| environment | Architecture environment | string | - | yes |
|
||||
| evictedkeys_limit_message | Custom message for Redis evicted keys monitor | string | `` | no |
|
||||
| evictedkeys_limit_silenced | Groups to mute for Redis evicted keys monitor | map | `<map>` | no |
|
||||
|
||||
@ -11,7 +11,7 @@ variable "message" {
|
||||
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
default = 900
|
||||
}
|
||||
|
||||
variable "filter_tags_use_defaults" {
|
||||
|
||||
@ -33,7 +33,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "evictedkeys" {
|
||||
name = "[${var.environment}] Redis too many evictedkeys {{#is_alert}}{{comparator}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
name = "[${var.environment}] Redis too many evictedkeys {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
message = "${coalesce(var.evictedkeys_limit_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -66,7 +66,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "percent_processor_time" {
|
||||
name = "[${var.environment}] Redis processor time too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Redis processor time too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.percent_processor_time_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -99,7 +99,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "server_load" {
|
||||
name = "[${var.environment}] Redis server load too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Redis server load too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.server_load_rate_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
|
||||
@ -34,7 +34,7 @@ Inputs
|
||||
| deadlock_message | Custom message for SQL Deadlock monitor | string | `` | no |
|
||||
| deadlock_silenced | Groups to mute for SQL Deadlock monitor | map | `<map>` | no |
|
||||
| deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||
| diskspace_message | Custom message for SQL disk space monitor | string | `` | no |
|
||||
| diskspace_silenced | Groups to mute for SQL disk space monitor | map | `<map>` | no |
|
||||
| diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no |
|
||||
|
||||
@ -7,7 +7,7 @@ variable "environment" {
|
||||
# Global DataDog
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
default = 900
|
||||
}
|
||||
|
||||
variable "message" {
|
||||
|
||||
@ -7,7 +7,7 @@ data "template_file" "filter" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "sql-database_cpu_90_15min" {
|
||||
name = "[${var.environment}] SQL Database CPU too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] SQL Database CPU too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.cpu_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -39,7 +39,7 @@ resource "datadog_monitor" "sql-database_cpu_90_15min" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "sql-database_free_space_low" {
|
||||
name = "[${var.environment}] SQL Database low free space {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] SQL Database low free space {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.diskspace_message, var.message)}"
|
||||
|
||||
type = "metric alert"
|
||||
@ -72,7 +72,7 @@ resource "datadog_monitor" "sql-database_free_space_low" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "sql-database_dtu_consumption_high" {
|
||||
name = "[${var.environment}] SQL Database DTU Consumption too high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] SQL Database DTU Consumption too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.dtu_message, var.message)}"
|
||||
|
||||
type = "metric alert"
|
||||
@ -105,7 +105,7 @@ resource "datadog_monitor" "sql-database_dtu_consumption_high" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "sql-database_deadlocks_count" {
|
||||
name = "[${var.environment}] SQL Database Deadlocks too high {{#is_alert}}{{comparator}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
name = "[${var.environment}] SQL Database Deadlocks too high {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
message = "${coalesce(var.deadlock_message, var.message)}"
|
||||
|
||||
type = "metric alert"
|
||||
|
||||
@ -44,7 +44,7 @@ Inputs
|
||||
| client_other_error_requests_silenced | Groups to mute for Storage other errors monitor | map | `<map>` | no |
|
||||
| client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `90` | no |
|
||||
| client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `50` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||
| environment | Architecture environment | string | - | yes |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
|
||||
@ -11,7 +11,7 @@ variable "message" {
|
||||
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
default = 900
|
||||
}
|
||||
|
||||
variable "filter_tags_use_defaults" {
|
||||
|
||||
@ -39,7 +39,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "successful_requests" {
|
||||
name = "[${var.environment}] Azure Storage too few successful requests {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Azure Storage too few successful requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.successful_requests_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -71,7 +71,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "latency" {
|
||||
name = "[${var.environment}] Azure Storage too high end to end latency {{#is_alert}}{{comparator}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
|
||||
name = "[${var.environment}] Azure Storage too high end to end latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
|
||||
message = "${coalesce(var.latency_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -103,7 +103,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "timeout_error_requests" {
|
||||
name = "[${var.environment}] Azure Storage too many timeout errors {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Azure Storage too many timeout errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.timeout_error_requests_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -135,7 +135,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "network_error_requests" {
|
||||
name = "[${var.environment}] Azure Storage too many network errors {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Azure Storage too many network errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.network_error_requests_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -167,7 +167,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "throttling_error_requests" {
|
||||
name = "[${var.environment}] Azure Storage too many throttling errors {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Azure Storage too many throttling errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.throttling_error_requests_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -199,7 +199,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "server_other_error_requests" {
|
||||
name = "[${var.environment}] Azure Storage too many server_other errors {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Azure Storage too many server_other errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.server_other_error_requests_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -231,7 +231,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "client_other_error_requests" {
|
||||
name = "[${var.environment}] Azure Storage too many client_other errors {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Azure Storage too many client_other errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.client_other_error_requests_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -263,7 +263,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "authorization_error_requests" {
|
||||
name = "[${var.environment}] Azure Storage too many authorization errors {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Azure Storage too many authorization errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.authorization_error_requests_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
|
||||
@ -23,7 +23,7 @@ Inputs
|
||||
| conversion_errors_silenced | Groups to mute for Stream Analytics conversion errors monitor | map | `<map>` | no |
|
||||
| conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no |
|
||||
| conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `900` | no |
|
||||
| environment | Architecture environment | string | - | yes |
|
||||
| failed_function_requests_message | Custom message for Stream Analytics failed requests monitor | string | `` | no |
|
||||
| failed_function_requests_silenced | Groups to mute for Stream Analytics failed requests monitor | map | `<map>` | no |
|
||||
|
||||
@ -11,7 +11,7 @@ variable "message" {
|
||||
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
default = 900
|
||||
}
|
||||
|
||||
variable "filter_tags_use_defaults" {
|
||||
|
||||
@ -33,7 +33,7 @@ resource "datadog_monitor" "status" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "su_utilization" {
|
||||
name = "[${var.environment}] Stream Analytics streaming units utilization too high {{#is_alert}}{{comparator}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
name = "[${var.environment}] Stream Analytics streaming units utilization too high {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
message = "${coalesce(var.su_utilization_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -66,7 +66,7 @@ resource "datadog_monitor" "su_utilization" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "failed_function_requests" {
|
||||
name = "[${var.environment}] Stream Analytics too many failed requests {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Stream Analytics too many failed requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.failed_function_requests_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -100,7 +100,7 @@ resource "datadog_monitor" "failed_function_requests" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "conversion_errors" {
|
||||
name = "[${var.environment}] Stream Analytics too many conversion errors {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Stream Analytics too many conversion errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.conversion_errors_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -133,7 +133,7 @@ resource "datadog_monitor" "conversion_errors" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "runtime_errors" {
|
||||
name = "[${var.environment}] Stream Analytics too many runtime errors {{#is_alert}}{{comparator}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
name = "[${var.environment}] Stream Analytics too many runtime errors {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||
message = "${coalesce(var.runtime_errors_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
|
||||
@ -75,7 +75,7 @@ Inputs
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| environment | Architecture Environment | string | - | yes |
|
||||
| evaluation_delay | Delay in seconds for the metric evaluation | string | `15` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| message | Message sent when an alert is triggered | string | - | yes |
|
||||
|
||||
@ -5,7 +5,7 @@ variable "environment" {
|
||||
}
|
||||
|
||||
# Global DataDog
|
||||
variable "evaluation_delay" {
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 15
|
||||
}
|
||||
|
||||
@ -20,8 +20,8 @@ resource "datadog_monitor" "mongodb_replicaset_state" {
|
||||
|
||||
notify_no_data = true
|
||||
renotify_interval = 0
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
|
||||
98
inputs.tf
98
inputs.tf
@ -1,98 +0,0 @@
|
||||
variable "hno_escalation_group" {}
|
||||
variable "ho_escalation_group" {}
|
||||
|
||||
variable env {}
|
||||
variable region {}
|
||||
|
||||
##linux
|
||||
variable "dd_system" {
|
||||
default = "disabled"
|
||||
}
|
||||
|
||||
variable "linux_basics_config" {
|
||||
type = "map"
|
||||
|
||||
default = {
|
||||
notify_no_data = false
|
||||
delay = 900
|
||||
}
|
||||
}
|
||||
|
||||
variable "dd_custom_cpu" {
|
||||
type = "map"
|
||||
|
||||
default = {
|
||||
status = "disabled"
|
||||
}
|
||||
}
|
||||
|
||||
#cpu threshold
|
||||
|
||||
variable "cpu_5_critical" {
|
||||
default = 95
|
||||
}
|
||||
|
||||
variable "cpu_15_critical" {
|
||||
default = 80
|
||||
}
|
||||
|
||||
## RDS
|
||||
variable "dd_aws_rds" {
|
||||
default = "disabled"
|
||||
}
|
||||
|
||||
variable "rds_config" {
|
||||
type = "map"
|
||||
|
||||
default = {
|
||||
notify_no_data = false
|
||||
delay = 900
|
||||
}
|
||||
}
|
||||
|
||||
variable "rds_cpu_threshold" {
|
||||
type = "map"
|
||||
|
||||
default = {
|
||||
warning = 80
|
||||
critical = 90
|
||||
}
|
||||
}
|
||||
|
||||
variable "rds_mem_threshold" {
|
||||
default = {
|
||||
warning = 20
|
||||
critical = 10
|
||||
}
|
||||
}
|
||||
|
||||
##apache nginx php
|
||||
variable "dd_nginx" {
|
||||
default = "disabled"
|
||||
}
|
||||
|
||||
variable "dd_php_fpm" {
|
||||
default = "disabled"
|
||||
}
|
||||
|
||||
variable "dd_apache" {
|
||||
default = "disabled"
|
||||
}
|
||||
|
||||
variable "apache_nginx_fpm_config" {
|
||||
type = "map"
|
||||
|
||||
default = {
|
||||
notify_no_data = false
|
||||
delay = 900
|
||||
}
|
||||
}
|
||||
|
||||
variable "php_fpm_busy_threshold" {
|
||||
type = "map"
|
||||
|
||||
default = {
|
||||
warning = 0.8
|
||||
critical = 0.9
|
||||
}
|
||||
}
|
||||
@ -17,7 +17,7 @@ Purpose
|
||||
-------
|
||||
Creates DataDog monitors with the following checks:
|
||||
|
||||
* Apache process
|
||||
* Apache connect
|
||||
|
||||
Inputs
|
||||
------
|
||||
@ -26,8 +26,8 @@ Inputs
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| apache_connect_message | Custom message for Apache process monitor | string | `` | no |
|
||||
| apache_connect_silenced | Groups to mute for Apache process monitor | map | `<map>` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
|
||||
| environment | Architecture Environment | string | - | yes |
|
||||
| evaluation_delay | Delay in seconds for the metric evaluation | string | `15` | no |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| message | Message sent when an alert is triggered | string | - | yes |
|
||||
| message | Message sent when an alert is triggered | string | - | yes |
|
||||
|
||||
@ -5,7 +5,7 @@ variable "environment" {
|
||||
}
|
||||
|
||||
# Global DataDog
|
||||
variable "evaluation_delay" {
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 15
|
||||
}
|
||||
|
||||
@ -23,8 +23,8 @@ resource "datadog_monitor" "datadog_apache_process" {
|
||||
}
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
|
||||
@ -17,7 +17,7 @@ Purpose
|
||||
-------
|
||||
Creates DataDog monitors with the following checks:
|
||||
|
||||
* Nginx process
|
||||
* Nginx connect
|
||||
|
||||
Inputs
|
||||
------
|
||||
@ -25,9 +25,9 @@ Inputs
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| environment | Architecture Environment | string | - | yes |
|
||||
| evaluation_delay | Delay in seconds for the metric evaluation | string | `15` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| message | Message sent when an alert is triggered | string | - | yes |
|
||||
| nginx_connect_message | Custom message for Nginx process monitor | string | `` | no |
|
||||
| nginx_connect_silenced | Groups to mute for Nginx process monitor | map | `<map>` | no |
|
||||
| nginx_connect_silenced | Groups to mute for Nginx process monitor | map | `<map>` | no |
|
||||
|
||||
@ -5,7 +5,7 @@ variable "environment" {
|
||||
}
|
||||
|
||||
# Global DataDog
|
||||
variable "evaluation_delay" {
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 15
|
||||
}
|
||||
|
||||
@ -23,8 +23,8 @@ resource "datadog_monitor" "datadog_nginx_process" {
|
||||
}
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
|
||||
@ -17,7 +17,7 @@ Purpose
|
||||
-------
|
||||
Creates DataDog monitors with the following checks:
|
||||
|
||||
* PHP FPM process
|
||||
* PHP FPM connect
|
||||
* PHP FPM load
|
||||
|
||||
Inputs
|
||||
@ -26,8 +26,7 @@ Inputs
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| environment | Architecture Environment | string | - | yes |
|
||||
| evaluation_delay_metric | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| evaluation_delay_service | Delay in seconds for the metric evaluation | string | `15` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| message | Message sent when an alert is triggered | string | - | yes |
|
||||
@ -36,4 +35,4 @@ Inputs
|
||||
| php_fpm_busy_threshold_critical | php fpm busy critical threshold | string | `0.9` | no |
|
||||
| php_fpm_busy_threshold_warning | php fpm busy warning threshold | string | `0.8` | no |
|
||||
| php_fpm_connect_message | Custom message for PHP FPM process monitor | string | `` | no |
|
||||
| php_fpm_connect_silenced | Groups to mute for PHP FPM process monitor | map | `<map>` | no |
|
||||
| php_fpm_connect_silenced | Groups to mute for PHP FPM process monitor | map | `<map>` | no |
|
||||
|
||||
@ -5,16 +5,11 @@ variable "environment" {
|
||||
}
|
||||
|
||||
# Global DataDog
|
||||
variable "evaluation_delay_service" {
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 15
|
||||
}
|
||||
|
||||
variable "evaluation_delay_metric" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
}
|
||||
|
||||
variable "message" {
|
||||
description = "Message sent when an alert is triggered"
|
||||
}
|
||||
|
||||
@ -7,7 +7,7 @@ data "template_file" "filter" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "datadog_php_fpm_connect_idle" {
|
||||
name = "[${var.environment}] php_fpm busy worker {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] php_fpm busy worker {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.php_fpm_busy_message, var.message)}"
|
||||
|
||||
type = "metric alert"
|
||||
@ -26,8 +26,8 @@ resource "datadog_monitor" "datadog_php_fpm_connect_idle" {
|
||||
}
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.evaluation_delay_metric}"
|
||||
new_host_delay = "${var.evaluation_delay_metric}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
@ -57,8 +57,8 @@ resource "datadog_monitor" "datadog_fpm_process" {
|
||||
}
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.evaluation_delay_service}"
|
||||
new_host_delay = "${var.evaluation_delay_service}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
|
||||
@ -17,7 +17,11 @@ Purpose
|
||||
-------
|
||||
Creates DataDog monitors with the following checks:
|
||||
|
||||
* System CPU High
|
||||
* CPU usage
|
||||
* CPU load ratio
|
||||
* Free memory
|
||||
* Free disk inodes
|
||||
* Free disk space
|
||||
|
||||
Inputs
|
||||
------
|
||||
@ -29,8 +33,13 @@ Inputs
|
||||
| cpu_high_threshold_critical | CPU high critical threshold | string | `95` | no |
|
||||
| cpu_high_threshold_warning | CPU high warning threshold | string | `80` | no |
|
||||
| cpu_high_timeframe | CPU high timeframe | string | `last_5m` | no |
|
||||
| cpu_load_message | Custom message for CPU load ratio monitor | string | `` | no |
|
||||
| cpu_load_silenced | Groups to mute for CPU load ratio monitor | map | `<map>` | no |
|
||||
| cpu_load_threshold_critical | CPU load ratio critical threshold | string | `4` | no |
|
||||
| cpu_load_threshold_warning | CPU load ratio warning threshold | string | `3` | no |
|
||||
| cpu_load_timeframe | CPU load ratio timeframe | string | `last_5m` | no |
|
||||
| environment | Architecture Environment | string | - | yes |
|
||||
| evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| free_disk_inodes_message | Custom message for Free disk inodes monitor | string | `` | no |
|
||||
|
||||
@ -5,9 +5,9 @@ variable "environment" {
|
||||
}
|
||||
|
||||
# Global DataDog
|
||||
variable "evaluation_delay" {
|
||||
variable "delay" {
|
||||
description = "Delay in seconds for the metric evaluation"
|
||||
default = 600
|
||||
default = 15
|
||||
}
|
||||
|
||||
variable "message" {
|
||||
@ -53,6 +53,33 @@ variable "cpu_high_threshold_critical" {
|
||||
default = 95
|
||||
}
|
||||
|
||||
# Scopes to silence on the CPU load ratio monitor; the empty default map
# mutes nothing.
variable "cpu_load_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for CPU load ratio monitor"
}
|
||||
|
||||
# Optional message override for the CPU load ratio monitor; empty by default.
variable "cpu_load_message" {
  type        = "string"
  default     = ""
  description = "Custom message for CPU load ratio monitor"
}
|
||||
|
||||
# Evaluation window inserted into the CPU load ratio monitor query.
variable "cpu_load_timeframe" {
  default     = "last_5m"
  description = "CPU load ratio timeframe"
}
|
||||
|
||||
# Warning level for the load ratio (system.load.5 divided by system.core.count).
variable "cpu_load_threshold_warning" {
  default     = 3
  description = "CPU load ratio warning threshold"
}
|
||||
|
||||
# Critical level for the load ratio (system.load.5 divided by system.core.count).
variable "cpu_load_threshold_critical" {
  default     = 4
  description = "CPU load ratio critical threshold"
}
|
||||
|
||||
variable "free_disk_space_silenced" {
|
||||
description = "Groups to mute for Free diskspace monitor"
|
||||
type = "map"
|
||||
|
||||
@ -7,7 +7,7 @@ data "template_file" "filter" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "datadog_cpu_too_high" {
|
||||
name = "[${var.environment}] CPU usage {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] CPU usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.cpu_high_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -26,8 +26,8 @@ resource "datadog_monitor" "datadog_cpu_too_high" {
|
||||
tags = ["env:${var.environment}", "type:system", "resource:cpu"]
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
@ -38,8 +38,41 @@ resource "datadog_monitor" "datadog_cpu_too_high" {
|
||||
silenced = "${var.cpu_high_silenced}"
|
||||
}
|
||||
|
||||
# CPU load ratio monitor: compares system.load.5 divided by system.core.count,
# grouped by region and host, against the configured thresholds.
resource "datadog_monitor" "datadog_load_too_high" {
  # The monitored value is a load-per-core ratio, not a percentage, so the
  # threshold and value are rendered without a "%" suffix.
  name = "[${var.environment}] CPU load 5 {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"

  # Monitor-specific message when set, otherwise the module-wide message.
  message = "${coalesce(var.cpu_load_message, var.message)}"

  # Uses the minimum over the timeframe, so the ratio must stay above the
  # critical threshold across the evaluated window to trigger.
  query = <<EOF
    min(${var.cpu_load_timeframe}): (
      avg:system.load.5{${data.template_file.filter.rendered}} by {region,host} /
      avg:system.core.count{${data.template_file.filter.rendered}} by {region,host}
    ) > ${var.cpu_load_threshold_critical}
EOF

  type = "metric alert"

  thresholds {
    warning  = "${var.cpu_load_threshold_warning}"
    critical = "${var.cpu_load_threshold_critical}"
  }

  tags = ["env:${var.environment}", "type:system", "resource:load"]

  notify_no_data      = true
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  no_data_timeframe   = 20

  silenced = "${var.cpu_load_silenced}"
}
|
||||
|
||||
resource "datadog_monitor" "datadog_free_disk_space_too_low" {
|
||||
name = "[${var.environment}] Free disk space {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Free disk space {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.free_disk_space_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -59,8 +92,8 @@ resource "datadog_monitor" "datadog_free_disk_space_too_low" {
|
||||
tags = ["env:${var.environment}", "type:system", "resource:disk"]
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
@ -72,7 +105,7 @@ resource "datadog_monitor" "datadog_free_disk_space_too_low" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "datadog_free_disk_space_inodes_too_low" {
|
||||
name = "[${var.environment}] Free disk inodes {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Free disk inodes {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.free_disk_inodes_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -92,8 +125,8 @@ resource "datadog_monitor" "datadog_free_disk_space_inodes_too_low" {
|
||||
tags = ["env:${var.environment}", "type:system", "resource:disk"]
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
@ -105,7 +138,7 @@ resource "datadog_monitor" "datadog_free_disk_space_inodes_too_low" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "datadog_free_memory" {
|
||||
name = "[${var.environment}] Free memory {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
name = "[${var.environment}] Free memory {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.free_memory_message, var.message)}"
|
||||
|
||||
query = <<EOF
|
||||
@ -125,8 +158,8 @@ resource "datadog_monitor" "datadog_free_memory" {
|
||||
tags = ["env:${var.environment}", "type:system", "resource:memory"]
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.evaluation_delay}"
|
||||
new_host_delay = "${var.evaluation_delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
new_host_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user