Merge branch 'MON-326_formalize_EOQ' into 'master'
Resolve MON-326 "Formalize eoq". Closes MON-326. See merge request claranet/pt-monitoring/projects/datadog/terraform/monitors!44
This commit is contained in:
commit
b9d13f26d0
@ -4,9 +4,9 @@ resource "datadog_monitor" "ark_schedules_monitor" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.ark_schedules_monitor_message, var.message)}"
|
message = "${coalesce(var.ark_schedules_monitor_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
sum(${var.ark_schedules_monitor_timeframe}):min:ark.ark_backup_failure_total${module.filter-tags.query_alert} by {schedule}.as_count() > 1
|
sum(${var.ark_schedules_monitor_timeframe}):min:ark.ark_backup_failure_total${module.filter-tags.query_alert} by {schedule}.as_count() > 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = 1
|
critical = 1
|
||||||
|
|||||||
@ -3,12 +3,12 @@ resource "datadog_monitor" "nginx_ingress_too_many_5xx" {
|
|||||||
name = "[${var.environment}] Nginx Ingress 5xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Nginx Ingress 5xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.ingress_5xx_message, var.message)}"
|
message = "${coalesce(var.ingress_5xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.ingress_5xx_time_aggregator}(${var.ingress_5xx_timeframe}): default(
|
${var.ingress_5xx_time_aggregator}(${var.ingress_5xx_timeframe}): default(
|
||||||
sum:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-5xx.query_alert} by {upstream,ingress_class}.as_rate() /
|
sum:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-5xx.query_alert} by {upstream,ingress_class}.as_rate() /
|
||||||
(sum:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class}.as_rate() + ${var.artificial_requests_count})
|
(sum:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class}.as_rate() + ${var.artificial_requests_count})
|
||||||
* 100, 0) > ${var.ingress_5xx_threshold_critical}
|
* 100, 0) > ${var.ingress_5xx_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -37,12 +37,12 @@ resource "datadog_monitor" "nginx_ingress_too_many_4xx" {
|
|||||||
name = "[${var.environment}] Nginx Ingress 4xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Nginx Ingress 4xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.ingress_4xx_message, var.message)}"
|
message = "${coalesce(var.ingress_4xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.ingress_4xx_time_aggregator}(${var.ingress_4xx_timeframe}): default(
|
${var.ingress_4xx_time_aggregator}(${var.ingress_4xx_timeframe}): default(
|
||||||
sum:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-4xx.query_alert} by {upstream,ingress_class}.as_rate() /
|
sum:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-4xx.query_alert} by {upstream,ingress_class}.as_rate() /
|
||||||
(sum:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class}.as_rate() + ${var.artificial_requests_count})
|
(sum:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class}.as_rate() + ${var.artificial_requests_count})
|
||||||
* 100, 0) > ${var.ingress_4xx_threshold_critical}
|
* 100, 0) > ${var.ingress_4xx_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -4,13 +4,13 @@ resource "datadog_monitor" "ALB_no_healthy_instances" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.alb_no_healthy_instances_message, var.message)}"
|
message = "${coalesce(var.alb_no_healthy_instances_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.alb_no_healthy_instances_time_aggregator}(${var.alb_no_healthy_instances_timeframe}): (
|
${var.alb_no_healthy_instances_time_aggregator}(${var.alb_no_healthy_instances_timeframe}): (
|
||||||
sum:aws.applicationelb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} / (
|
sum:aws.applicationelb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} / (
|
||||||
sum:aws.applicationelb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} +
|
sum:aws.applicationelb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} +
|
||||||
sum:aws.applicationelb.un_healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} )
|
sum:aws.applicationelb.un_healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} )
|
||||||
) * 100 < 1
|
) * 100 < 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -37,11 +37,11 @@ resource "datadog_monitor" "ALB_latency" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.latency_message, var.message)}"
|
message = "${coalesce(var.latency_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.latency_time_aggregator}(${var.latency_timeframe}):
|
${var.latency_time_aggregator}(${var.latency_timeframe}):
|
||||||
default(avg:aws.applicationelb.target_response_time.average${module.filter-tags.query_alert} by {region,loadbalancer}, 0)
|
default(avg:aws.applicationelb.target_response_time.average${module.filter-tags.query_alert} by {region,loadbalancer}, 0)
|
||||||
> ${var.latency_threshold_critical}
|
> ${var.latency_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -68,12 +68,12 @@ resource "datadog_monitor" "ALB_httpcode_5xx" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.httpcode_alb_5xx_message, var.message)}"
|
message = "${coalesce(var.httpcode_alb_5xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.httpcode_alb_5xx_time_aggregator}(${var.httpcode_alb_5xx_timeframe}):
|
${var.httpcode_alb_5xx_time_aggregator}(${var.httpcode_alb_5xx_timeframe}):
|
||||||
default(avg:aws.applicationelb.httpcode_elb_5xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
|
default(avg:aws.applicationelb.httpcode_elb_5xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
|
||||||
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
|
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.httpcode_alb_5xx_threshold_critical}
|
* 100 > ${var.httpcode_alb_5xx_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -100,12 +100,12 @@ resource "datadog_monitor" "ALB_httpcode_4xx" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.httpcode_alb_4xx_message, var.message)}"
|
message = "${coalesce(var.httpcode_alb_4xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.httpcode_alb_4xx_time_aggregator}(${var.httpcode_alb_4xx_timeframe}):
|
${var.httpcode_alb_4xx_time_aggregator}(${var.httpcode_alb_4xx_timeframe}):
|
||||||
default(avg:aws.applicationelb.httpcode_elb_4xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
|
default(avg:aws.applicationelb.httpcode_elb_4xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
|
||||||
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
|
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.httpcode_alb_4xx_threshold_critical}
|
* 100 > ${var.httpcode_alb_4xx_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -132,12 +132,12 @@ resource "datadog_monitor" "ALB_httpcode_target_5xx" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.httpcode_target_5xx_message, var.message)}"
|
message = "${coalesce(var.httpcode_target_5xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.httpcode_target_5xx_time_aggregator}(${var.httpcode_target_5xx_timeframe}):
|
${var.httpcode_target_5xx_time_aggregator}(${var.httpcode_target_5xx_timeframe}):
|
||||||
default(avg:aws.applicationelb.httpcode_target_5xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
|
default(avg:aws.applicationelb.httpcode_target_5xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
|
||||||
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
|
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.httpcode_target_5xx_threshold_critical}
|
* 100 > ${var.httpcode_target_5xx_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -164,12 +164,12 @@ resource "datadog_monitor" "ALB_httpcode_target_4xx" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.httpcode_target_4xx_message, var.message)}"
|
message = "${coalesce(var.httpcode_target_4xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.httpcode_target_4xx_time_aggregator}(${var.httpcode_target_4xx_timeframe}):
|
${var.httpcode_target_4xx_time_aggregator}(${var.httpcode_target_4xx_timeframe}):
|
||||||
default(avg:aws.applicationelb.httpcode_target_4xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
|
default(avg:aws.applicationelb.httpcode_target_4xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
|
||||||
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
|
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.httpcode_target_4xx_threshold_critical}
|
* 100 > ${var.httpcode_target_4xx_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
|
|||||||
@ -5,11 +5,11 @@ resource "datadog_monitor" "API_Gateway_latency" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.latency_message, var.message)}"
|
message = "${coalesce(var.latency_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.latency_time_aggregator}(${var.latency_timeframe}):
|
${var.latency_time_aggregator}(${var.latency_timeframe}):
|
||||||
default(avg:aws.apigateway.latency{${var.filter_tags}} by {region,apiname,stage}, 0)
|
default(avg:aws.apigateway.latency{${var.filter_tags}} by {region,apiname,stage}, 0)
|
||||||
> ${var.latency_threshold_critical}
|
> ${var.latency_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -37,12 +37,12 @@ resource "datadog_monitor" "API_http_5xx_errors_count" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.http_5xx_requests_message, var.message)}"
|
message = "${coalesce(var.http_5xx_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.http_5xx_requests_time_aggregator}(${var.http_5xx_requests_timeframe}):
|
${var.http_5xx_requests_time_aggregator}(${var.http_5xx_requests_timeframe}):
|
||||||
default(avg:aws.apigateway.5xxerror{${var.filter_tags}} by {region,apiname,stage}.as_rate(), 0) / (
|
default(avg:aws.apigateway.5xxerror{${var.filter_tags}} by {region,apiname,stage}.as_rate(), 0) / (
|
||||||
default(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname,stage}.as_rate() + ${var.artificial_requests_count}, 1))
|
default(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname,stage}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.http_5xx_requests_threshold_critical}
|
* 100 > ${var.http_5xx_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -70,12 +70,12 @@ resource "datadog_monitor" "API_http_4xx_errors_count" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.http_4xx_requests_message, var.message)}"
|
message = "${coalesce(var.http_4xx_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.http_4xx_requests_time_aggregator}(${var.http_4xx_requests_timeframe}):
|
${var.http_4xx_requests_time_aggregator}(${var.http_4xx_requests_timeframe}):
|
||||||
default(avg:aws.apigateway.4xxerror{${var.filter_tags}} by {region,apiname,stage}.as_rate(), 0) / (
|
default(avg:aws.apigateway.4xxerror{${var.filter_tags}} by {region,apiname,stage}.as_rate(), 0) / (
|
||||||
default(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname,stage}.as_rate() + ${var.artificial_requests_count}, 1))
|
default(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname,stage}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.http_4xx_requests_threshold_critical}
|
* 100 > ${var.http_4xx_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
|
|||||||
@ -5,11 +5,11 @@ resource "datadog_monitor" "elasticache_eviction" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
sum(${var.eviction_timeframe}): (
|
sum(${var.eviction_timeframe}): (
|
||||||
avg:aws.elasticache.evictions${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
avg:aws.elasticache.evictions${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
||||||
) > ${var.eviction_threshold_critical}
|
) > ${var.eviction_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.eviction_threshold_warning}"
|
warning = "${var.eviction_threshold_warning}"
|
||||||
@ -38,11 +38,11 @@ resource "datadog_monitor" "elasticache_max_connection" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.max_connection_time_aggregator}(${var.max_connection_timeframe}): (
|
${var.max_connection_time_aggregator}(${var.max_connection_timeframe}): (
|
||||||
avg:aws.elasticache.curr_connections${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
avg:aws.elasticache.curr_connections${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
||||||
) >= 65000
|
) >= 65000
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
notify_no_data = true
|
notify_no_data = true
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
@ -66,11 +66,11 @@ resource "datadog_monitor" "elasticache_no_connection" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.no_connection_time_aggregator}(${var.no_connection_timeframe}): (
|
${var.no_connection_time_aggregator}(${var.no_connection_timeframe}): (
|
||||||
avg:aws.elasticache.curr_connections${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
avg:aws.elasticache.curr_connections${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
||||||
) <= 0
|
) <= 0
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
notify_no_data = false
|
notify_no_data = false
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
@ -94,11 +94,11 @@ resource "datadog_monitor" "elasticache_swap" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.swap_time_aggregator}(${var.swap_timeframe}): (
|
${var.swap_time_aggregator}(${var.swap_timeframe}): (
|
||||||
avg:aws.elasticache.swap_usage${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
avg:aws.elasticache.swap_usage${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
||||||
) > ${var.swap_threshold_critical}
|
) > ${var.swap_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.swap_threshold_warning}"
|
warning = "${var.swap_threshold_warning}"
|
||||||
@ -127,11 +127,11 @@ resource "datadog_monitor" "elasticache_free_memory" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
pct_change(avg(${var.free_memory_timeframe}),${var.free_memory_condition_timeframe}):
|
pct_change(avg(${var.free_memory_timeframe}),${var.free_memory_condition_timeframe}):
|
||||||
avg:aws.elasticache.freeable_memory${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
avg:aws.elasticache.freeable_memory${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
||||||
< ${var.free_memory_threshold_critical}
|
< ${var.free_memory_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.free_memory_threshold_warning}"
|
warning = "${var.free_memory_threshold_warning}"
|
||||||
@ -160,11 +160,11 @@ resource "datadog_monitor" "elasticache_eviction_growing" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
pct_change(avg(${var.eviction_growing_timeframe}),${var.eviction_growing_condition_timeframe}):
|
pct_change(avg(${var.eviction_growing_timeframe}),${var.eviction_growing_condition_timeframe}):
|
||||||
avg:aws.elasticache.evictions${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
avg:aws.elasticache.evictions${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
||||||
> ${var.eviction_growing_threshold_critical}
|
> ${var.eviction_growing_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.eviction_growing_threshold_warning}"
|
warning = "${var.eviction_growing_threshold_warning}"
|
||||||
|
|||||||
@ -5,13 +5,13 @@ resource "datadog_monitor" "memcached_get_hits" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.get_hits_time_aggregator}(${var.get_hits_timeframe}): (
|
${var.get_hits_time_aggregator}(${var.get_hits_timeframe}): (
|
||||||
default(avg:aws.elasticache.get_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0) / (
|
default(avg:aws.elasticache.get_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0) / (
|
||||||
default(avg:aws.elasticache.get_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0) +
|
default(avg:aws.elasticache.get_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0) +
|
||||||
default(avg:aws.elasticache.get_misses${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0))
|
default(avg:aws.elasticache.get_misses${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0))
|
||||||
) * 100 < ${var.get_hits_threshold_critical}
|
) * 100 < ${var.get_hits_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.get_hits_threshold_warning}"
|
warning = "${var.get_hits_threshold_warning}"
|
||||||
@ -40,11 +40,11 @@ resource "datadog_monitor" "memcached_cpu_high" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cpu_high_time_aggregator}(${var.cpu_high_timeframe}): (
|
${var.cpu_high_time_aggregator}(${var.cpu_high_timeframe}): (
|
||||||
avg:aws.elasticache.cpuutilization${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
avg:aws.elasticache.cpuutilization${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
||||||
) > ${var.cpu_high_threshold_critical}
|
) > ${var.cpu_high_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.cpu_high_threshold_warning}"
|
warning = "${var.cpu_high_threshold_warning}"
|
||||||
|
|||||||
@ -5,13 +5,13 @@ resource "datadog_monitor" "redis_cache_hits" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cache_hits_time_aggregator}(${var.cache_hits_timeframe}): default(
|
${var.cache_hits_time_aggregator}(${var.cache_hits_timeframe}): default(
|
||||||
avg:aws.elasticache.cache_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate() / (
|
avg:aws.elasticache.cache_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate() / (
|
||||||
avg:aws.elasticache.cache_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate() +
|
avg:aws.elasticache.cache_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate() +
|
||||||
avg:aws.elasticache.cache_misses${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate())
|
avg:aws.elasticache.cache_misses${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate())
|
||||||
* 100, 100) < ${var.cache_hits_threshold_critical}
|
* 100, 100) < ${var.cache_hits_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.cache_hits_threshold_warning}"
|
warning = "${var.cache_hits_threshold_warning}"
|
||||||
@ -40,11 +40,11 @@ resource "datadog_monitor" "redis_cpu_high" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cpu_high_time_aggregator}(${var.cpu_high_timeframe}): (
|
${var.cpu_high_time_aggregator}(${var.cpu_high_timeframe}): (
|
||||||
avg:aws.elasticache.engine_cpuutilization${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
avg:aws.elasticache.engine_cpuutilization${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
||||||
) > ${var.cpu_high_threshold_critical}
|
) > ${var.cpu_high_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
notify_no_data = true
|
notify_no_data = true
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
@ -68,11 +68,11 @@ resource "datadog_monitor" "redis_replication_lag" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.replication_lag_time_aggregator}(${var.replication_lag_timeframe}): (
|
${var.replication_lag_time_aggregator}(${var.replication_lag_timeframe}): (
|
||||||
avg:aws.elasticache.replication_lag${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
avg:aws.elasticache.replication_lag${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
|
||||||
) > ${var.replication_lag_threshold_critical}
|
) > ${var.replication_lag_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.replication_lag_threshold_warning}"
|
warning = "${var.replication_lag_threshold_warning}"
|
||||||
@ -101,12 +101,12 @@ resource "datadog_monitor" "redis_commands" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
sum(${var.commands_timeframe}): (
|
sum(${var.commands_timeframe}): (
|
||||||
avg:aws.elasticache.get_type_cmds${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count() +
|
avg:aws.elasticache.get_type_cmds${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count() +
|
||||||
avg:aws.elasticache.set_type_cmds${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count()
|
avg:aws.elasticache.set_type_cmds${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count()
|
||||||
) <= 0
|
) <= 0
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
notify_no_data = false
|
notify_no_data = false
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
|
|||||||
@ -10,12 +10,12 @@ resource "datadog_monitor" "es_cluster_status" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
max(${var.es_cluster_status_timeframe}): (
|
max(${var.es_cluster_status_timeframe}): (
|
||||||
avg:aws.es.cluster_statusred${module.filter-tags.query_alert} by {region,name} * 2 +
|
avg:aws.es.cluster_statusred${module.filter-tags.query_alert} by {region,name} * 2 +
|
||||||
(avg:aws.es.cluster_statusyellow${module.filter-tags.query_alert} by {region,name} + 0.1)
|
(avg:aws.es.cluster_statusyellow${module.filter-tags.query_alert} by {region,name} + 0.1)
|
||||||
) >= 2
|
) >= 2
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = 1
|
warning = 1
|
||||||
@ -45,12 +45,12 @@ resource "datadog_monitor" "es_free_space_low" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.diskspace_time_aggregator}(${var.diskspace_timeframe}): (
|
${var.diskspace_time_aggregator}(${var.diskspace_timeframe}): (
|
||||||
avg:aws.es.free_storage_space${module.filter-tags.query_alert} by {region,name} /
|
avg:aws.es.free_storage_space${module.filter-tags.query_alert} by {region,name} /
|
||||||
(${var.es_cluster_volume_size}*1000) * 100
|
(${var.es_cluster_volume_size}*1000) * 100
|
||||||
) < ${var.diskspace_threshold_critical}
|
) < ${var.diskspace_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.diskspace_threshold_warning}"
|
warning = "${var.diskspace_threshold_warning}"
|
||||||
@ -80,11 +80,11 @@ resource "datadog_monitor" "es_cpu_90_15min" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cpu_time_aggregator}(${var.cpu_timeframe}): (
|
${var.cpu_time_aggregator}(${var.cpu_timeframe}): (
|
||||||
avg:aws.es.cpuutilization${module.filter-tags.query_alert} by {region,name}
|
avg:aws.es.cpuutilization${module.filter-tags.query_alert} by {region,name}
|
||||||
) > ${var.cpu_threshold_critical}
|
) > ${var.cpu_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.cpu_threshold_warning}"
|
warning = "${var.cpu_threshold_warning}"
|
||||||
|
|||||||
@ -3,13 +3,13 @@ resource "datadog_monitor" "ELB_no_healthy_instances" {
|
|||||||
name = "[${var.environment}] ELB healthy instances {{#is_alert}}is at 0{{/is_alert}}{{#is_warning}}is at {{value}}%{{/is_warning}}"
|
name = "[${var.environment}] ELB healthy instances {{#is_alert}}is at 0{{/is_alert}}{{#is_warning}}is at {{value}}%{{/is_warning}}"
|
||||||
message = "${coalesce(var.elb_no_healthy_instance_message, var.message)}"
|
message = "${coalesce(var.elb_no_healthy_instance_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.elb_no_healthy_instance_time_aggregator}(${var.elb_no_healthy_instance_timeframe}): (
|
${var.elb_no_healthy_instance_time_aggregator}(${var.elb_no_healthy_instance_timeframe}): (
|
||||||
sum:aws.elb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} / (
|
sum:aws.elb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} / (
|
||||||
sum:aws.elb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} +
|
sum:aws.elb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} +
|
||||||
sum:aws.elb.un_healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} )
|
sum:aws.elb.un_healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} )
|
||||||
) * 100 < 1
|
) * 100 < 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -38,12 +38,12 @@ resource "datadog_monitor" "ELB_too_much_4xx" {
|
|||||||
name = "[${var.environment}] ELB 4xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] ELB 4xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.elb_4xx_message, var.message)}"
|
message = "${coalesce(var.elb_4xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
sum(${var.elb_4xx_timeframe}):
|
sum(${var.elb_4xx_timeframe}):
|
||||||
default(avg:aws.elb.httpcode_elb_4xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
|
default(avg:aws.elb.httpcode_elb_4xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
|
||||||
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
|
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.elb_4xx_threshold_critical}
|
* 100 > ${var.elb_4xx_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -72,12 +72,12 @@ resource "datadog_monitor" "ELB_too_much_5xx" {
|
|||||||
name = "[${var.environment}] ELB 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] ELB 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.elb_5xx_message, var.message)}"
|
message = "${coalesce(var.elb_5xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
sum(${var.elb_5xx_timeframe}):
|
sum(${var.elb_5xx_timeframe}):
|
||||||
default(avg:aws.elb.httpcode_elb_5xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
|
default(avg:aws.elb.httpcode_elb_5xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
|
||||||
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
|
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.elb_5xx_threshold_critical}
|
* 100 > ${var.elb_5xx_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -106,12 +106,12 @@ resource "datadog_monitor" "ELB_too_much_4xx_backend" {
|
|||||||
name = "[${var.environment}] ELB backend 4xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] ELB backend 4xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.elb_backend_4xx_message, var.message)}"
|
message = "${coalesce(var.elb_backend_4xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
sum(${var.elb_backend_4xx_timeframe}):
|
sum(${var.elb_backend_4xx_timeframe}):
|
||||||
default(avg:aws.elb.httpcode_backend_4xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
|
default(avg:aws.elb.httpcode_backend_4xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
|
||||||
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
|
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.elb_backend_4xx_threshold_critical}
|
* 100 > ${var.elb_backend_4xx_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -140,12 +140,12 @@ resource "datadog_monitor" "ELB_too_much_5xx_backend" {
|
|||||||
name = "[${var.environment}] ELB backend 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] ELB backend 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.elb_backend_5xx_message, var.message)}"
|
message = "${coalesce(var.elb_backend_5xx_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
sum(${var.elb_backend_5xx_timeframe}):
|
sum(${var.elb_backend_5xx_timeframe}):
|
||||||
default(avg:aws.elb.httpcode_backend_5xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
|
default(avg:aws.elb.httpcode_backend_5xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
|
||||||
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
|
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
|
||||||
* 100 > ${var.elb_backend_5xx_threshold_critical}
|
* 100 > ${var.elb_backend_5xx_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -174,11 +174,11 @@ resource "datadog_monitor" "ELB_backend_latency" {
|
|||||||
name = "[${var.environment}] ELB latency too high {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"
|
name = "[${var.environment}] ELB latency too high {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"
|
||||||
message = "${coalesce(var.elb_backend_latency_message, var.message)}"
|
message = "${coalesce(var.elb_backend_latency_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.elb_backend_latency_time_aggregator}(${var.elb_backend_latency_timeframe}):
|
${var.elb_backend_latency_time_aggregator}(${var.elb_backend_latency_timeframe}):
|
||||||
default(avg:aws.elb.latency${module.filter-tags.query_alert} by {region,loadbalancername}, 0)
|
default(avg:aws.elb.latency${module.filter-tags.query_alert} by {region,loadbalancername}, 0)
|
||||||
> ${var.elb_backend_latency_critical}
|
> ${var.elb_backend_latency_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -6,11 +6,11 @@ resource "datadog_monitor" "firehose_incoming_records" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
sum(${var.incoming_records_timeframe}): (
|
sum(${var.incoming_records_timeframe}): (
|
||||||
avg:aws.firehose.incoming_records${module.filter-tags.query_alert} by {region,deliverystreamname}
|
avg:aws.firehose.incoming_records${module.filter-tags.query_alert} by {region,deliverystreamname}
|
||||||
) <= 0
|
) <= 0
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = 0
|
critical = 0
|
||||||
|
|||||||
@ -6,11 +6,11 @@ resource "datadog_monitor" "rds_aurora_mysql_replica_lag" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
avg(${var.aurora_replicalag_timeframe}): (
|
avg(${var.aurora_replicalag_timeframe}): (
|
||||||
avg:aws.rds.aurora_replica_lag${module.filter-tags.query_alert} by {region,name}
|
avg:aws.rds.aurora_replica_lag${module.filter-tags.query_alert} by {region,name}
|
||||||
) > ${var.aurora_replicalag_threshold_critical}
|
) > ${var.aurora_replicalag_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.aurora_replicalag_threshold_warning}"
|
warning = "${var.aurora_replicalag_threshold_warning}"
|
||||||
|
|||||||
@ -6,11 +6,11 @@ resource "datadog_monitor" "rds_aurora_postgresql_replica_lag" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
avg(${var.aurora_replicalag_timeframe}): (
|
avg(${var.aurora_replicalag_timeframe}): (
|
||||||
avg:aws.rds.rdsto_aurora_postgre_sqlreplica_lag${module.filter-tags.query_alert} by {region,name}
|
avg:aws.rds.rdsto_aurora_postgre_sqlreplica_lag${module.filter-tags.query_alert} by {region,name}
|
||||||
) > ${var.aurora_replicalag_threshold_critical}
|
) > ${var.aurora_replicalag_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.aurora_replicalag_threshold_warning}"
|
warning = "${var.aurora_replicalag_threshold_warning}"
|
||||||
|
|||||||
@ -6,11 +6,11 @@ resource "datadog_monitor" "rds_cpu_90_15min" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cpu_time_aggregator}(${var.cpu_timeframe}): (
|
${var.cpu_time_aggregator}(${var.cpu_timeframe}): (
|
||||||
avg:aws.rds.cpuutilization${module.filter-tags.query_alert} by {region,name}
|
avg:aws.rds.cpuutilization${module.filter-tags.query_alert} by {region,name}
|
||||||
) > ${var.cpu_threshold_critical}
|
) > ${var.cpu_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.cpu_threshold_warning}"
|
warning = "${var.cpu_threshold_warning}"
|
||||||
@ -39,12 +39,12 @@ resource "datadog_monitor" "rds_free_space_low" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.diskspace_time_aggregator}(${var.diskspace_timeframe}): (
|
${var.diskspace_time_aggregator}(${var.diskspace_timeframe}): (
|
||||||
avg:aws.rds.free_storage_space${module.filter-tags.query_alert} by {region,name} /
|
avg:aws.rds.free_storage_space${module.filter-tags.query_alert} by {region,name} /
|
||||||
avg:aws.rds.total_storage_space${module.filter-tags.query_alert} by {region,name} * 100
|
avg:aws.rds.total_storage_space${module.filter-tags.query_alert} by {region,name} * 100
|
||||||
) < ${var.diskspace_threshold_critical}
|
) < ${var.diskspace_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.diskspace_threshold_warning}"
|
warning = "${var.diskspace_threshold_warning}"
|
||||||
@ -73,11 +73,11 @@ resource "datadog_monitor" "rds_replica_lag" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
avg(${var.replicalag_timeframe}): (
|
avg(${var.replicalag_timeframe}): (
|
||||||
avg:aws.rds.replica_lag${module.filter-tags.query_alert} by {region,name}
|
avg:aws.rds.replica_lag${module.filter-tags.query_alert} by {region,name}
|
||||||
) > ${var.replicalag_threshold_critical}
|
) > ${var.replicalag_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.replicalag_threshold_warning}"
|
warning = "${var.replicalag_threshold_warning}"
|
||||||
|
|||||||
@ -3,11 +3,11 @@ resource "datadog_monitor" "VPN_status" {
|
|||||||
name = "[${var.environment}] VPN tunnel down"
|
name = "[${var.environment}] VPN tunnel down"
|
||||||
message = "${coalesce(var.vpn_status_message, var.message)}"
|
message = "${coalesce(var.vpn_status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.vpn_status_time_aggregator}(${var.vpn_status_timeframe}): (
|
${var.vpn_status_time_aggregator}(${var.vpn_status_timeframe}): (
|
||||||
min:aws.vpn.tunnel_state{${var.filter_tags}} by {region,tunnelipaddress}
|
min:aws.vpn.tunnel_state{${var.filter_tags}} by {region,tunnelipaddress}
|
||||||
) < 1
|
) < 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -3,9 +3,9 @@ resource "datadog_monitor" "apimgt_status" {
|
|||||||
name = "[${var.environment}] API Management is down"
|
name = "[${var.environment}] API Management is down"
|
||||||
message = "${coalesce(var.status_message, var.message)}"
|
message = "${coalesce(var.status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.status_time_aggregator}(${var.status_timeframe}):avg:azure.apimanagement_service.status${module.filter-tags.query_alert} by {resource_group,region,name} < 1
|
${var.status_time_aggregator}(${var.status_timeframe}):avg:azure.apimanagement_service.status${module.filter-tags.query_alert} by {resource_group,region,name} < 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -33,12 +33,12 @@ resource "datadog_monitor" "apimgt_failed_requests" {
|
|||||||
name = "[${var.environment}] API Management too many failed requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] API Management too many failed requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.failed_requests_message, var.message)}"
|
message = "${coalesce(var.failed_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.failed_requests_time_aggregator}(${var.failed_requests_timeframe}): (
|
${var.failed_requests_time_aggregator}(${var.failed_requests_timeframe}): (
|
||||||
default(avg:azure.apimanagement_service.failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
default(avg:azure.apimanagement_service.failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
||||||
default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
) * 100 > ${var.failed_requests_threshold_critical}
|
) * 100 > ${var.failed_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.failed_requests_threshold_critical}"
|
critical = "${var.failed_requests_threshold_critical}"
|
||||||
@ -66,12 +66,12 @@ resource "datadog_monitor" "apimgt_other_requests" {
|
|||||||
name = "[${var.environment}] API Management too many other requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] API Management too many other requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.other_requests_message, var.message)}"
|
message = "${coalesce(var.other_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.other_requests_time_aggregator}(${var.other_requests_timeframe}): (
|
${var.other_requests_time_aggregator}(${var.other_requests_timeframe}): (
|
||||||
default(avg:azure.apimanagement_service.other_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
default(avg:azure.apimanagement_service.other_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
||||||
default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
) * 100 > ${var.other_requests_threshold_critical}
|
) * 100 > ${var.other_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.other_requests_threshold_critical}"
|
critical = "${var.other_requests_threshold_critical}"
|
||||||
@ -99,12 +99,12 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" {
|
|||||||
name = "[${var.environment}] API Management too many unauthorized requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] API Management too many unauthorized requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.unauthorized_requests_message, var.message)}"
|
message = "${coalesce(var.unauthorized_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.unauthorized_requests_time_aggregator}(${var.unauthorized_requests_timeframe}): (
|
${var.unauthorized_requests_time_aggregator}(${var.unauthorized_requests_timeframe}): (
|
||||||
default(avg:azure.apimanagement_service.unauthorized_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
default(avg:azure.apimanagement_service.unauthorized_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
||||||
default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
) * 100 > ${var.unauthorized_requests_threshold_critical}
|
) * 100 > ${var.unauthorized_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.unauthorized_requests_threshold_critical}"
|
critical = "${var.unauthorized_requests_threshold_critical}"
|
||||||
@ -132,14 +132,14 @@ resource "datadog_monitor" "apimgt_successful_requests" {
|
|||||||
name = "[${var.environment}] API Management successful requests rate too low {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] API Management successful requests rate too low {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.successful_requests_message, var.message)}"
|
message = "${coalesce(var.successful_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.successful_requests_time_aggregator}(${var.successful_requests_timeframe}):
|
${var.successful_requests_time_aggregator}(${var.successful_requests_timeframe}):
|
||||||
default(
|
default(
|
||||||
avg:azure.apimanagement_service.successful_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() /
|
avg:azure.apimanagement_service.successful_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() /
|
||||||
avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate()
|
avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate()
|
||||||
* 100
|
* 100
|
||||||
, 100) < ${var.successful_requests_threshold_critical}
|
, 100) < ${var.successful_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.successful_requests_threshold_critical}"
|
critical = "${var.successful_requests_threshold_critical}"
|
||||||
|
|||||||
@ -5,11 +5,11 @@ resource "datadog_monitor" "appservices_response_time" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.response_time_message, var.message)}"
|
message = "${coalesce(var.response_time_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.response_time_time_aggregator}(${var.response_time_timeframe}): (
|
${var.response_time_time_aggregator}(${var.response_time_timeframe}): (
|
||||||
default(avg:azure.app_services.average_response_time${module.filter-tags.query_alert} by {resource_group,region,name,instance}, 0)
|
default(avg:azure.app_services.average_response_time${module.filter-tags.query_alert} by {resource_group,region,name,instance}, 0)
|
||||||
) > ${var.response_time_threshold_critical}
|
) > ${var.response_time_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -37,11 +37,11 @@ resource "datadog_monitor" "appservices_memory_usage_count" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.memory_usage_message, var.message)}"
|
message = "${coalesce(var.memory_usage_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.memory_usage_time_aggregator}(${var.memory_usage_timeframe}): (
|
${var.memory_usage_time_aggregator}(${var.memory_usage_timeframe}): (
|
||||||
avg:azure.app_services.memory_working_set${module.filter-tags.query_alert} by {resource_group,region,name,instance}
|
avg:azure.app_services.memory_working_set${module.filter-tags.query_alert} by {resource_group,region,name,instance}
|
||||||
) > ${var.memory_usage_threshold_critical}
|
) > ${var.memory_usage_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -69,12 +69,12 @@ resource "datadog_monitor" "appservices_http_5xx_errors_count" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.http_5xx_requests_message, var.message)}"
|
message = "${coalesce(var.http_5xx_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.http_5xx_requests_time_aggregator}(${var.http_5xx_requests_timeframe}): (
|
${var.http_5xx_requests_time_aggregator}(${var.http_5xx_requests_timeframe}): (
|
||||||
default(avg:azure.app_services.http5xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) /
|
default(avg:azure.app_services.http5xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) /
|
||||||
default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 1)
|
default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 1)
|
||||||
) * 100 > ${var.http_5xx_requests_threshold_critical}
|
) * 100 > ${var.http_5xx_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -102,12 +102,12 @@ resource "datadog_monitor" "appservices_http_4xx_errors_count" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.http_4xx_requests_message, var.message)}"
|
message = "${coalesce(var.http_4xx_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.http_4xx_requests_time_aggregator}(${var.http_4xx_requests_timeframe}): (
|
${var.http_4xx_requests_time_aggregator}(${var.http_4xx_requests_timeframe}): (
|
||||||
default(avg:azure.app_services.http4xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) /
|
default(avg:azure.app_services.http4xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) /
|
||||||
default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 1)
|
default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 1)
|
||||||
) * 100 > ${var.http_4xx_requests_threshold_critical}
|
) * 100 > ${var.http_4xx_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -135,14 +135,14 @@ resource "datadog_monitor" "appservices_http_success_status_rate" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.http_successful_requests_message, var.message)}"
|
message = "${coalesce(var.http_successful_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.http_successful_requests_time_aggregator}(${var.http_successful_requests_timeframe}):
|
${var.http_successful_requests_time_aggregator}(${var.http_successful_requests_timeframe}):
|
||||||
default( (
|
default( (
|
||||||
(default(avg:azure.app_services.http2xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) +
|
(default(avg:azure.app_services.http2xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) +
|
||||||
default(avg:azure.app_services.http3xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) ) /
|
default(avg:azure.app_services.http3xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) ) /
|
||||||
default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0)
|
default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0)
|
||||||
) * 100, 100) < ${var.http_successful_requests_threshold_critical}
|
) * 100, 100) < ${var.http_successful_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
|
|||||||
@ -5,11 +5,11 @@ resource "datadog_monitor" "azure_search_latency" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.latency_message, var.message)}"
|
message = "${coalesce(var.latency_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.latency_time_aggregator}(${var.latency_timeframe}): (
|
${var.latency_time_aggregator}(${var.latency_timeframe}): (
|
||||||
avg:azure.search_searchservices.search_latency${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.search_searchservices.search_latency${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.latency_threshold_critical}
|
) > ${var.latency_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -37,11 +37,11 @@ resource "datadog_monitor" "azure_search_throttled_queries_rate" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.throttled_queries_rate_message, var.message)}"
|
message = "${coalesce(var.throttled_queries_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.throttled_queries_rate_time_aggregator}(${var.throttled_queries_rate_timeframe}): (
|
${var.throttled_queries_rate_time_aggregator}(${var.throttled_queries_rate_timeframe}): (
|
||||||
avg:azure.search_searchservices.throttled_search_queries_percentage${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.search_searchservices.throttled_search_queries_percentage${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.throttled_queries_rate_threshold_critical}
|
) > ${var.throttled_queries_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
|
|||||||
@ -4,11 +4,11 @@ resource "datadog_monitor" "cosmos_db_status" {
|
|||||||
name = "[${var.environment}] Cosmos DB is down"
|
name = "[${var.environment}] Cosmos DB is down"
|
||||||
message = "${coalesce(var.status_message, var.message)}"
|
message = "${coalesce(var.status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.status_time_aggregator}(${var.status_timeframe}):
|
${var.status_time_aggregator}(${var.status_timeframe}):
|
||||||
avg:azure.cosmosdb.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.cosmosdb.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
< 1
|
< 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -38,7 +38,7 @@ resource "datadog_monitor" "cosmos_db_4xx_requests" {
|
|||||||
message = "${coalesce(var.cosmos_db_4xx_requests_message, var.message)}"
|
message = "${coalesce(var.cosmos_db_4xx_requests_message, var.message)}"
|
||||||
|
|
||||||
# List of available status codes : https://docs.microsoft.com/en-us/rest/api/cosmos-db/http-status-codes-for-cosmosdb
|
# List of available status codes : https://docs.microsoft.com/en-us/rest/api/cosmos-db/http-status-codes-for-cosmosdb
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cosmos_db_4xx_request_time_aggregator}(${var.cosmos_db_4xx_request_timeframe}): default( (
|
${var.cosmos_db_4xx_request_time_aggregator}(${var.cosmos_db_4xx_request_timeframe}): default( (
|
||||||
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "400")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "400")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "401")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "401")} by {resource_group,region,name,collectionname}.as_rate(), 0) +
|
||||||
@ -52,7 +52,7 @@ resource "datadog_monitor" "cosmos_db_4xx_requests" {
|
|||||||
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "449")} by {resource_group,region,name,collectionname}.as_rate(), 0) ) /
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "449")} by {resource_group,region,name,collectionname}.as_rate(), 0) ) /
|
||||||
default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,databasename,collectionname}.as_rate(), 0)
|
default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,databasename,collectionname}.as_rate(), 0)
|
||||||
, 0) * 100 > ${var.cosmos_db_4xx_request_rate_threshold_critical}
|
, 0) * 100 > ${var.cosmos_db_4xx_request_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -82,13 +82,13 @@ resource "datadog_monitor" "cosmos_db_5xx_requests" {
|
|||||||
name = "[${var.environment}] Cosmos DB 5xx requests rate is high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Cosmos DB 5xx requests rate is high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.cosmos_db_5xx_requests_message, var.message)}"
|
message = "${coalesce(var.cosmos_db_5xx_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cosmos_db_5xx_request_time_aggregator}(${var.cosmos_db_5xx_request_timeframe}): default( (
|
${var.cosmos_db_5xx_request_time_aggregator}(${var.cosmos_db_5xx_request_timeframe}): default( (
|
||||||
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "500")} by {resource_group,region,name,databasename,collectionname}.as_rate(), 0) +
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "500")} by {resource_group,region,name,databasename,collectionname}.as_rate(), 0) +
|
||||||
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "503")} by {resource_group,region,name,databasename,collectionname}.as_rate(), 0)) /
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "503")} by {resource_group,region,name,databasename,collectionname}.as_rate(), 0)) /
|
||||||
default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 0)
|
default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 0)
|
||||||
, 0) * 100 > ${var.cosmos_db_5xx_request_rate_threshold_critical}
|
, 0) * 100 > ${var.cosmos_db_5xx_request_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -119,12 +119,12 @@ resource "datadog_monitor" "cosmos_db_scaling" {
|
|||||||
message = "${coalesce(var.cosmos_db_scaling_message, var.message)}"
|
message = "${coalesce(var.cosmos_db_scaling_message, var.message)}"
|
||||||
|
|
||||||
# List of available status codes : https://docs.microsoft.com/en-us/rest/api/cosmos-db/http-status-codes-for-cosmosdb
|
# List of available status codes : https://docs.microsoft.com/en-us/rest/api/cosmos-db/http-status-codes-for-cosmosdb
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cosmos_db_scaling_time_aggregator}(${var.cosmos_db_scaling_timeframe}): default(
|
${var.cosmos_db_scaling_time_aggregator}(${var.cosmos_db_scaling_timeframe}): default(
|
||||||
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,databasename,collectionname}.as_rate(), 0) /
|
default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,databasename,collectionname}.as_rate(), 0) /
|
||||||
default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,databasename,collectionname}.as_rate(), 0)
|
default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,databasename,collectionname}.as_rate(), 0)
|
||||||
, 0) * 100 > ${var.cosmos_db_scaling_error_rate_threshold_critical}
|
, 0) * 100 > ${var.cosmos_db_scaling_error_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -4,11 +4,11 @@ resource "datadog_monitor" "datalakestore_status" {
|
|||||||
name = "[${var.environment}] Datalake Store is down"
|
name = "[${var.environment}] Datalake Store is down"
|
||||||
message = "${coalesce(var.status_message, var.message)}"
|
message = "${coalesce(var.status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.status_time_aggregator}(${var.status_timeframe}): (
|
${var.status_time_aggregator}(${var.status_timeframe}): (
|
||||||
avg:azure.datalakestore_accounts.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.datalakestore_accounts.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) < 1
|
) < 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -4,10 +4,10 @@ resource "datadog_monitor" "eventgrid_no_successful_message" {
|
|||||||
message = "${coalesce(var.no_successful_message_rate_message, var.message)}"
|
message = "${coalesce(var.no_successful_message_rate_message, var.message)}"
|
||||||
|
|
||||||
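# The "< 0" comparison in this query is intentional: a success count should never be negative, so the monitor is only meant to react to no-data
|
# The "< 0" comparison in this query is intentional: a success count should never be negative, so the monitor is only meant to react to no-data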
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.no_successful_message_rate_time_aggregator}(${var.no_successful_message_rate_timeframe}):
|
${var.no_successful_message_rate_time_aggregator}(${var.no_successful_message_rate_timeframe}):
|
||||||
avg:azure.eventgrid_topics.publish_success_count${module.filter-tags.query_alert} by {resource_group,region,name} < 0
|
avg:azure.eventgrid_topics.publish_success_count${module.filter-tags.query_alert} by {resource_group,region,name} < 0
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -31,7 +31,7 @@ resource "datadog_monitor" "eventgrid_failed_messages" {
|
|||||||
name = "[${var.environment}] Event Grid too many failed messages {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Event Grid too many failed messages {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.failed_messages_rate_message, var.message)}"
|
message = "${coalesce(var.failed_messages_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.failed_messages_rate_time_aggregator}(${var.failed_messages_rate_timeframe}): (default(
|
${var.failed_messages_rate_time_aggregator}(${var.failed_messages_rate_timeframe}): (default(
|
||||||
avg:azure.eventgrid_topics.publish_fail_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() /
|
avg:azure.eventgrid_topics.publish_fail_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() /
|
||||||
(avg:azure.eventgrid_topics.publish_success_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() +
|
(avg:azure.eventgrid_topics.publish_success_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() +
|
||||||
@ -39,7 +39,7 @@ resource "datadog_monitor" "eventgrid_failed_messages" {
|
|||||||
avg:azure.eventgrid_topics.unmatched_event_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate()
|
avg:azure.eventgrid_topics.unmatched_event_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate()
|
||||||
) * 100, 0)
|
) * 100, 0)
|
||||||
) > ${var.failed_messages_rate_thresold_critical}
|
) > ${var.failed_messages_rate_thresold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -68,7 +68,7 @@ resource "datadog_monitor" "eventgrid_unmatched_events" {
|
|||||||
name = "[${var.environment}] Event Grid too many unmatched events {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Event Grid too many unmatched events {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.unmatched_events_rate_message, var.message)}"
|
message = "${coalesce(var.unmatched_events_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.unmatched_events_rate_time_aggregator}(${var.unmatched_events_rate_timeframe}): (default(
|
${var.unmatched_events_rate_time_aggregator}(${var.unmatched_events_rate_timeframe}): (default(
|
||||||
avg:azure.eventgrid_topics.unmatched_event_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() /
|
avg:azure.eventgrid_topics.unmatched_event_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() /
|
||||||
(avg:azure.eventgrid_topics.publish_success_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() +
|
(avg:azure.eventgrid_topics.publish_success_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() +
|
||||||
@ -76,7 +76,7 @@ resource "datadog_monitor" "eventgrid_unmatched_events" {
|
|||||||
avg:azure.eventgrid_topics.unmatched_event_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate()
|
avg:azure.eventgrid_topics.unmatched_event_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate()
|
||||||
) * 100, 0)
|
) * 100, 0)
|
||||||
) > ${var.unmatched_events_rate_thresold_critical}
|
) > ${var.unmatched_events_rate_thresold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -3,11 +3,11 @@ resource "datadog_monitor" "eventhub_status" {
|
|||||||
name = "[${var.environment}] Event Hub is down"
|
name = "[${var.environment}] Event Hub is down"
|
||||||
message = "${coalesce(var.status_message, var.message)}"
|
message = "${coalesce(var.status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.status_time_aggregator}(${var.status_timeframe}): (
|
${var.status_time_aggregator}(${var.status_timeframe}): (
|
||||||
avg:azure.eventhub_namespaces.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.eventhub_namespaces.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) != 1
|
) != 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -31,12 +31,12 @@ resource "datadog_monitor" "eventhub_failed_requests" {
|
|||||||
name = "[${var.environment}] Event Hub too many failed requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Event Hub too many failed requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.failed_requests_rate_message, var.message)}"
|
message = "${coalesce(var.failed_requests_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.failed_requests_rate_time_aggregator}(${var.failed_requests_rate_timeframe}): (
|
${var.failed_requests_rate_time_aggregator}(${var.failed_requests_rate_timeframe}): (
|
||||||
default(avg:azure.eventhub_namespaces.failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
default(avg:azure.eventhub_namespaces.failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
||||||
default(avg:azure.eventhub_namespaces.incoming_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
default(avg:azure.eventhub_namespaces.incoming_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
) * 100 > ${var.failed_requests_rate_thresold_critical}
|
) * 100 > ${var.failed_requests_rate_thresold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -65,14 +65,14 @@ resource "datadog_monitor" "eventhub_errors" {
|
|||||||
name = "[${var.environment}] Event Hub too many errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Event Hub too many errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.errors_rate_message, var.message)}"
|
message = "${coalesce(var.errors_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.errors_rate_time_aggregator}(${var.errors_rate_timeframe}): ( (
|
${var.errors_rate_time_aggregator}(${var.errors_rate_timeframe}): ( (
|
||||||
default(avg:azure.eventhub_namespaces.internal_server_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
default(avg:azure.eventhub_namespaces.internal_server_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
default(avg:azure.eventhub_namespaces.server_busy_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
default(avg:azure.eventhub_namespaces.server_busy_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
default(avg:azure.eventhub_namespaces.other_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) ) /
|
default(avg:azure.eventhub_namespaces.other_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) ) /
|
||||||
default(avg:azure.eventhub_namespaces.incoming_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
default(avg:azure.eventhub_namespaces.incoming_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
) * 100 > ${var.errors_rate_thresold_critical}
|
) * 100 > ${var.errors_rate_thresold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -4,12 +4,12 @@ resource "datadog_monitor" "function_http_5xx_errors_rate" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.http_5xx_errors_rate_message, var.message)}"
|
message = "${coalesce(var.http_5xx_errors_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.http_5xx_errors_rate_time_aggregator}(${var.http_5xx_errors_rate_timeframe}): default(
|
${var.http_5xx_errors_rate_time_aggregator}(${var.http_5xx_errors_rate_timeframe}): default(
|
||||||
default(avg:azure.functions.http5xx${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
default(avg:azure.functions.http5xx${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
||||||
default(avg:azure.functions.function_execution_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
default(avg:azure.functions.function_execution_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
, 0) * 100 > ${var.http_5xx_errors_rate_threshold_critical}
|
, 0) * 100 > ${var.http_5xx_errors_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -36,11 +36,11 @@ resource "datadog_monitor" "function_high_connections_count" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.high_connections_count_message, var.message)}"
|
message = "${coalesce(var.high_connections_count_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.high_connections_count_time_aggregator}(${var.high_connections_count_timeframe}):
|
${var.high_connections_count_time_aggregator}(${var.high_connections_count_timeframe}):
|
||||||
default(azure.functions.connections${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0)
|
default(azure.functions.connections${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0)
|
||||||
> ${var.high_connections_count_threshold_critical}
|
> ${var.high_connections_count_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -67,11 +67,11 @@ resource "datadog_monitor" "function_high_threads_count" {
|
|||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
message = "${coalesce(var.high_threads_count_message, var.message)}"
|
message = "${coalesce(var.high_threads_count_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.high_threads_count_time_aggregator}(${var.high_threads_count_timeframe}):
|
${var.high_threads_count_time_aggregator}(${var.high_threads_count_timeframe}):
|
||||||
default(azure.functions.thread_count${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0)
|
default(azure.functions.thread_count${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0)
|
||||||
> ${var.high_threads_count_threshold_critical}
|
> ${var.high_threads_count_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
|
|||||||
@ -3,14 +3,14 @@ resource "datadog_monitor" "too_many_jobs_failed" {
|
|||||||
name = "[${var.environment}] IOT Hub Too many jobs failed {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] IOT Hub Too many jobs failed {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.failed_jobs_rate_message, var.message)}"
|
message = "${coalesce(var.failed_jobs_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.failed_jobs_rate_time_aggregator}(${var.failed_jobs_rate_timeframe}):
|
${var.failed_jobs_rate_time_aggregator}(${var.failed_jobs_rate_timeframe}):
|
||||||
default(
|
default(
|
||||||
default(avg:azure.devices_iothubs.jobs.failed${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
default(avg:azure.devices_iothubs.jobs.failed${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
default(avg:azure.devices_iothubs.jobs.failed${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
default(avg:azure.devices_iothubs.jobs.failed${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
default(avg:azure.devices_iothubs.jobs.completed${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
default(avg:azure.devices_iothubs.jobs.completed${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
* 100 , 0) > ${var.failed_jobs_rate_threshold_critical}
|
* 100 , 0) > ${var.failed_jobs_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -39,14 +39,14 @@ resource "datadog_monitor" "too_many_list_jobs_failed" {
|
|||||||
name = "[${var.environment}] IOT Hub Too many list_jobs failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] IOT Hub Too many list_jobs failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.failed_listjobs_rate_message, var.message)}"
|
message = "${coalesce(var.failed_listjobs_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.failed_listjobs_rate_time_aggregator}(${var.failed_listjobs_rate_timeframe}):
|
${var.failed_listjobs_rate_time_aggregator}(${var.failed_listjobs_rate_timeframe}):
|
||||||
default(
|
default(
|
||||||
default(avg:azure.devices_iothubs.jobs.list_jobs.failure${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) / (
|
default(avg:azure.devices_iothubs.jobs.list_jobs.failure${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) / (
|
||||||
default(avg:azure.devices_iothubs.jobs.list_jobs.success${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) +
|
default(avg:azure.devices_iothubs.jobs.list_jobs.success${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) +
|
||||||
default(avg:azure.devices_iothubs.jobs.list_jobs.failure${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) )
|
default(avg:azure.devices_iothubs.jobs.list_jobs.failure${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) )
|
||||||
* 100, 0) > ${var.failed_listjobs_rate_threshold_critical}
|
* 100, 0) > ${var.failed_listjobs_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -75,14 +75,14 @@ resource "datadog_monitor" "too_many_query_jobs_failed" {
|
|||||||
name = "[${var.environment}] IOT Hub Too many query_jobs failed {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] IOT Hub Too many query_jobs failed {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.failed_queryjobs_rate_message, var.message)}"
|
message = "${coalesce(var.failed_queryjobs_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.failed_queryjobs_rate_time_aggregator}(${var.failed_queryjobs_rate_timeframe}):
|
${var.failed_queryjobs_rate_time_aggregator}(${var.failed_queryjobs_rate_timeframe}):
|
||||||
default(
|
default(
|
||||||
default(avg:azure.devices_iothubs.jobs.query_jobs.failure${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) / (
|
default(avg:azure.devices_iothubs.jobs.query_jobs.failure${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) / (
|
||||||
default(avg:azure.devices_iothubs.jobs.query_jobs.success${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) +
|
default(avg:azure.devices_iothubs.jobs.query_jobs.success${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) +
|
||||||
default(avg:azure.devices_iothubs.jobs.query_jobs.failure${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) )
|
default(avg:azure.devices_iothubs.jobs.query_jobs.failure${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) )
|
||||||
* 100, 0) > ${var.failed_queryjobs_rate_threshold_critical}
|
* 100, 0) > ${var.failed_queryjobs_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -111,11 +111,11 @@ resource "datadog_monitor" "status" {
|
|||||||
name = "[${var.environment}] IOT Hub is down"
|
name = "[${var.environment}] IOT Hub is down"
|
||||||
message = "${coalesce(var.status_message, var.message)}"
|
message = "${coalesce(var.status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.status_time_aggregator}(${var.status_timeframe}): (
|
${var.status_time_aggregator}(${var.status_timeframe}): (
|
||||||
avg:azure.devices_iothubs.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.devices_iothubs.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) < 1
|
) < 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -139,11 +139,11 @@ resource "datadog_monitor" "total_devices" {
|
|||||||
name = "[${var.environment}] IOT Hub Total devices is wrong {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
name = "[${var.environment}] IOT Hub Total devices is wrong {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||||
message = "${coalesce(var.total_devices_message, var.message)}"
|
message = "${coalesce(var.total_devices_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.total_devices_time_aggregator}(${var.total_devices_timeframe}): (
|
${var.total_devices_time_aggregator}(${var.total_devices_timeframe}): (
|
||||||
avg:azure.devices_iothubs.devices.total_devices${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.devices_iothubs.devices.total_devices${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) == 0
|
) == 0
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -167,14 +167,14 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" {
|
|||||||
name = "[${var.environment}] IOT Hub Too many c2d methods failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] IOT Hub Too many c2d methods failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.failed_c2d_methods_rate_message, var.message)}"
|
message = "${coalesce(var.failed_c2d_methods_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.failed_c2d_methods_rate_time_aggregator}(${var.failed_c2d_methods_rate_timeframe}):
|
${var.failed_c2d_methods_rate_time_aggregator}(${var.failed_c2d_methods_rate_timeframe}):
|
||||||
default(
|
default(
|
||||||
default(avg:azure.devices_iothubs.c2d.methods.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
default(avg:azure.devices_iothubs.c2d.methods.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
default(avg:azure.devices_iothubs.c2d.methods.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
default(avg:azure.devices_iothubs.c2d.methods.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
default(avg:azure.devices_iothubs.c2d.methods.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
default(avg:azure.devices_iothubs.c2d.methods.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
* 100, 0) > ${var.failed_c2d_methods_rate_threshold_critical}
|
* 100, 0) > ${var.failed_c2d_methods_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -203,14 +203,14 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
|
|||||||
name = "[${var.environment}] IOT Hub Too many c2d twin read failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] IOT Hub Too many c2d twin read failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.failed_c2d_twin_read_rate_message, var.message)}"
|
message = "${coalesce(var.failed_c2d_twin_read_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.failed_c2d_twin_read_rate_time_aggregator}(${var.failed_c2d_twin_read_rate_timeframe}):
|
${var.failed_c2d_twin_read_rate_time_aggregator}(${var.failed_c2d_twin_read_rate_timeframe}):
|
||||||
default(
|
default(
|
||||||
default(avg:azure.devices_iothubs.c2d.twin.read.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
default(avg:azure.devices_iothubs.c2d.twin.read.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
default(avg:azure.devices_iothubs.c2d.twin.read.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
default(avg:azure.devices_iothubs.c2d.twin.read.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
default(avg:azure.devices_iothubs.c2d.twin.read.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
default(avg:azure.devices_iothubs.c2d.twin.read.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
* 100, 0) > ${var.failed_c2d_twin_read_rate_threshold_critical}
|
* 100, 0) > ${var.failed_c2d_twin_read_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -239,14 +239,14 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
|
|||||||
name = "[${var.environment}] IOT Hub Too many c2d twin update failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] IOT Hub Too many c2d twin update failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.failed_c2d_twin_update_rate_message, var.message)}"
|
message = "${coalesce(var.failed_c2d_twin_update_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.failed_c2d_twin_update_rate_time_aggregator}(${var.failed_c2d_twin_update_rate_timeframe}):
|
${var.failed_c2d_twin_update_rate_time_aggregator}(${var.failed_c2d_twin_update_rate_timeframe}):
|
||||||
default(
|
default(
|
||||||
default(avg:azure.devices_iothubs.c2d.twin.update.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
default(avg:azure.devices_iothubs.c2d.twin.update.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
default(avg:azure.devices_iothubs.c2d.twin.update.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
default(avg:azure.devices_iothubs.c2d.twin.update.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
default(avg:azure.devices_iothubs.c2d.twin.update.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
default(avg:azure.devices_iothubs.c2d.twin.update.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
* 100, 0) > ${var.failed_c2d_twin_update_rate_threshold_critical}
|
* 100, 0) > ${var.failed_c2d_twin_update_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -275,14 +275,14 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" {
|
|||||||
name = "[${var.environment}] IOT Hub Too many d2c twin read failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] IOT Hub Too many d2c twin read failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.failed_d2c_twin_read_rate_message, var.message)}"
|
message = "${coalesce(var.failed_d2c_twin_read_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.failed_d2c_twin_read_rate_time_aggregator}(${var.failed_d2c_twin_read_rate_timeframe}):
|
${var.failed_d2c_twin_read_rate_time_aggregator}(${var.failed_d2c_twin_read_rate_timeframe}):
|
||||||
default(
|
default(
|
||||||
default(avg:azure.devices_iothubs.d2c.twin.read.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
default(avg:azure.devices_iothubs.d2c.twin.read.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
default(avg:azure.devices_iothubs.d2c.twin.read.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
default(avg:azure.devices_iothubs.d2c.twin.read.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
default(avg:azure.devices_iothubs.d2c.twin.read.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
default(avg:azure.devices_iothubs.d2c.twin.read.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
* 100, 0) > ${var.failed_d2c_twin_read_rate_threshold_critical}
|
* 100, 0) > ${var.failed_d2c_twin_read_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -311,14 +311,14 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" {
|
|||||||
name = "[${var.environment}] IOT Hub Too many d2c twin update failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] IOT Hub Too many d2c twin update failure {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.failed_d2c_twin_update_rate_message, var.message)}"
|
message = "${coalesce(var.failed_d2c_twin_update_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.failed_d2c_twin_update_rate_time_aggregator}(${var.failed_d2c_twin_update_rate_timeframe}):
|
${var.failed_d2c_twin_update_rate_time_aggregator}(${var.failed_d2c_twin_update_rate_timeframe}):
|
||||||
default(
|
default(
|
||||||
default(avg:azure.devices_iothubs.d2c.twin.update.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
default(avg:azure.devices_iothubs.d2c.twin.update.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
default(avg:azure.devices_iothubs.d2c.twin.update.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
default(avg:azure.devices_iothubs.d2c.twin.update.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
default(avg:azure.devices_iothubs.d2c.twin.update.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
default(avg:azure.devices_iothubs.d2c.twin.update.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
* 100, 0) > ${var.failed_d2c_twin_update_rate_threshold_critical}
|
* 100, 0) > ${var.failed_d2c_twin_update_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -347,7 +347,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
|
|||||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||||
message = "${coalesce(var.dropped_d2c_telemetry_egress_message, var.message)}"
|
message = "${coalesce(var.dropped_d2c_telemetry_egress_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.dropped_d2c_telemetry_egress_time_aggregator}(${var.dropped_d2c_telemetry_egress_timeframe}):
|
${var.dropped_d2c_telemetry_egress_time_aggregator}(${var.dropped_d2c_telemetry_egress_timeframe}):
|
||||||
default(
|
default(
|
||||||
default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
@ -356,7 +356,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
|
|||||||
default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
default(avg:azure.devices_iothubs.d2c.telemetry.egress.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
* 100, 0) > ${var.dropped_d2c_telemetry_egress_rate_threshold_critical}
|
* 100, 0) > ${var.dropped_d2c_telemetry_egress_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -385,7 +385,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
|
|||||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||||
message = "${coalesce(var.orphaned_d2c_telemetry_egress_message, var.message)}"
|
message = "${coalesce(var.orphaned_d2c_telemetry_egress_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.orphaned_d2c_telemetry_egress_time_aggregator}(${var.orphaned_d2c_telemetry_egress_timeframe}):
|
${var.orphaned_d2c_telemetry_egress_time_aggregator}(${var.orphaned_d2c_telemetry_egress_timeframe}):
|
||||||
default(
|
default(
|
||||||
default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
@ -394,7 +394,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
|
|||||||
default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
default(avg:azure.devices_iothubs.d2c.telemetry.egress.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
* 100, 0) > ${var.orphaned_d2c_telemetry_egress_rate_threshold_critical}
|
* 100, 0) > ${var.orphaned_d2c_telemetry_egress_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -423,7 +423,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
|
|||||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||||
message = "${coalesce(var.invalid_d2c_telemetry_egress_message, var.message)}"
|
message = "${coalesce(var.invalid_d2c_telemetry_egress_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.invalid_d2c_telemetry_egress_time_aggregator}(${var.invalid_d2c_telemetry_egress_timeframe}):
|
${var.invalid_d2c_telemetry_egress_time_aggregator}(${var.invalid_d2c_telemetry_egress_timeframe}):
|
||||||
default(
|
default(
|
||||||
default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (
|
||||||
@ -432,7 +432,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
|
|||||||
default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
|
||||||
default(avg:azure.devices_iothubs.d2c.telemetry.egress.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
default(avg:azure.devices_iothubs.d2c.telemetry.egress.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
* 100, 0) > ${var.invalid_d2c_telemetry_egress_rate_threshold_critical}
|
* 100, 0) > ${var.invalid_d2c_telemetry_egress_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -461,13 +461,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" {
|
|||||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress not sent {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress not sent {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||||
message = "${coalesce(var.too_many_d2c_telemetry_ingress_nosent_message, var.message)}"
|
message = "${coalesce(var.too_many_d2c_telemetry_ingress_nosent_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
sum(${var.too_many_d2c_telemetry_ingress_nosent_timeframe}):
|
sum(${var.too_many_d2c_telemetry_ingress_nosent_timeframe}):
|
||||||
default(
|
default(
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() -
|
avg:azure.devices_iothubs.d2c.telemetry.ingress.all_protocol${module.filter-tags.query_alert} by {resource_group,region,name}.as_count() -
|
||||||
avg:azure.devices_iothubs.d2c.telemetry.ingress.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_count()
|
avg:azure.devices_iothubs.d2c.telemetry.ingress.success${module.filter-tags.query_alert} by {resource_group,region,name}.as_count()
|
||||||
, 0) > 0
|
, 0) > 0
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -4,11 +4,11 @@ resource "datadog_monitor" "keyvault_status" {
|
|||||||
name = "[${var.environment}] Key Vault is down"
|
name = "[${var.environment}] Key Vault is down"
|
||||||
message = "${coalesce(var.status_message, var.message)}"
|
message = "${coalesce(var.status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.status_time_aggregator}(${var.status_timeframe}): (
|
${var.status_time_aggregator}(${var.status_timeframe}): (
|
||||||
avg:azure.keyvault_vaults.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.keyvault_vaults.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) < 1
|
) < 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -33,14 +33,14 @@ resource "datadog_monitor" "keyvault_api_result" {
|
|||||||
name = "[${var.environment}] Key Vault API result rate is low {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Key Vault API result rate is low {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.status_message, var.message)}"
|
message = "${coalesce(var.status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.api_result_time_aggregator}(${var.api_result_timeframe}):
|
${var.api_result_time_aggregator}(${var.api_result_timeframe}):
|
||||||
default(
|
default(
|
||||||
avg:azure.keyvault_vaults.service_api_result${format(module.filter-tags-statuscode.query_alert, "200")} by {name,resource_group,region}.as_rate() /
|
avg:azure.keyvault_vaults.service_api_result${format(module.filter-tags-statuscode.query_alert, "200")} by {name,resource_group,region}.as_rate() /
|
||||||
avg:azure.keyvault_vaults.service_api_result${module.filter-tags.query_alert} by {name,resource_group,region}.as_rate()
|
avg:azure.keyvault_vaults.service_api_result${module.filter-tags.query_alert} by {name,resource_group,region}.as_rate()
|
||||||
* 100
|
* 100
|
||||||
, 100) < ${var.api_result_threshold_critical}
|
, 100) < ${var.api_result_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.api_result_threshold_critical}"
|
critical = "${var.api_result_threshold_critical}"
|
||||||
@ -70,11 +70,11 @@ resource "datadog_monitor" "keyvault_api_latency" {
|
|||||||
name = "[${var.environment}] Key Vault API latency is high {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
|
name = "[${var.environment}] Key Vault API latency is high {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
|
||||||
message = "${coalesce(var.status_message, var.message)}"
|
message = "${coalesce(var.status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.api_latency_time_aggregator}(${var.api_latency_timeframe}):
|
${var.api_latency_time_aggregator}(${var.api_latency_timeframe}):
|
||||||
avg:azure.keyvault_vaults.service_api_latency${module.filter-tags-activity.query_alert} by {name,resource_group,region}
|
avg:azure.keyvault_vaults.service_api_latency${module.filter-tags-activity.query_alert} by {name,resource_group,region}
|
||||||
> ${var.api_latency_threshold_critical}
|
> ${var.api_latency_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.api_latency_threshold_critical}"
|
critical = "${var.api_latency_threshold_critical}"
|
||||||
|
|||||||
@ -4,11 +4,11 @@ resource "datadog_monitor" "loadbalancer_status" {
|
|||||||
name = "[${var.environment}] Load Balancer is unreachable"
|
name = "[${var.environment}] Load Balancer is unreachable"
|
||||||
message = "${coalesce(var.status_message, var.message)}"
|
message = "${coalesce(var.status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.status_time_aggregator}(${var.status_timeframe}): (
|
${var.status_time_aggregator}(${var.status_timeframe}): (
|
||||||
avg:azure.network_loadbalancers.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.network_loadbalancers.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) < 1
|
) < 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -3,11 +3,11 @@ resource "datadog_monitor" "mysql_cpu_usage" {
|
|||||||
name = "[${var.environment}] Mysql Server CPU usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Mysql Server CPU usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.cpu_usage_message, var.message)}"
|
message = "${coalesce(var.cpu_usage_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cpu_usage_time_aggregator}(${var.cpu_usage_timeframe}): (
|
${var.cpu_usage_time_aggregator}(${var.cpu_usage_timeframe}): (
|
||||||
avg:azure.dbformysql_servers.cpu_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.dbformysql_servers.cpu_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.cpu_usage_threshold_critical}
|
) > ${var.cpu_usage_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -36,11 +36,11 @@ resource "datadog_monitor" "mysql_free_storage" {
|
|||||||
name = "[${var.environment}] Mysql Server storage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Mysql Server storage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.free_storage_message, var.message)}"
|
message = "${coalesce(var.free_storage_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.free_storage_time_aggregator}(${var.free_storage_timeframe}): (
|
${var.free_storage_time_aggregator}(${var.free_storage_timeframe}): (
|
||||||
100 - avg:azure.dbformysql_servers.storage_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
100 - avg:azure.dbformysql_servers.storage_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) < ${var.free_storage_threshold_critical}
|
) < ${var.free_storage_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -69,11 +69,11 @@ resource "datadog_monitor" "mysql_io_consumption" {
|
|||||||
name = "[${var.environment}] Mysql Server IO consumption {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Mysql Server IO consumption {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.io_consumption_message, var.message)}"
|
message = "${coalesce(var.io_consumption_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.io_consumption_time_aggregator}(${var.io_consumption_timeframe}): (
|
${var.io_consumption_time_aggregator}(${var.io_consumption_timeframe}): (
|
||||||
avg:azure.dbformysql_servers.io_consumption_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.dbformysql_servers.io_consumption_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.io_consumption_threshold_critical}
|
) > ${var.io_consumption_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -102,11 +102,11 @@ resource "datadog_monitor" "mysql_memory_usage" {
|
|||||||
name = "[${var.environment}] Mysql Server memory usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Mysql Server memory usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.memory_usage_message, var.message)}"
|
message = "${coalesce(var.memory_usage_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.memory_usage_time_aggregator}(${var.memory_usage_timeframe}): (
|
${var.memory_usage_time_aggregator}(${var.memory_usage_timeframe}): (
|
||||||
avg:azure.dbformysql_servers.memory_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.dbformysql_servers.memory_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.memory_usage_threshold_critical}
|
) > ${var.memory_usage_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -3,11 +3,11 @@ resource "datadog_monitor" "postgresql_cpu_usage" {
|
|||||||
name = "[${var.environment}] Postgresql Server CPU usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Postgresql Server CPU usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.cpu_usage_message, var.message)}"
|
message = "${coalesce(var.cpu_usage_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cpu_usage_time_aggregator}(${var.cpu_usage_timeframe}): (
|
${var.cpu_usage_time_aggregator}(${var.cpu_usage_timeframe}): (
|
||||||
avg:azure.dbforpostgresql_servers.cpu_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.dbforpostgresql_servers.cpu_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.cpu_usage_threshold_critical}
|
) > ${var.cpu_usage_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -36,11 +36,11 @@ resource "datadog_monitor" "postgresql_no_connection" {
|
|||||||
name = "[${var.environment}] Postgresql Server has no connection"
|
name = "[${var.environment}] Postgresql Server has no connection"
|
||||||
message = "${coalesce(var.no_connection_message, var.message)}"
|
message = "${coalesce(var.no_connection_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.no_connection_time_aggregator}(${var.no_connection_timeframe}): (
|
${var.no_connection_time_aggregator}(${var.no_connection_timeframe}): (
|
||||||
avg:azure.dbforpostgresql_servers.active_connections${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.dbforpostgresql_servers.active_connections${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) < 1
|
) < 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -64,11 +64,11 @@ resource "datadog_monitor" "postgresql_free_storage" {
|
|||||||
name = "[${var.environment}] Postgresql Server storage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Postgresql Server storage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.free_storage_message, var.message)}"
|
message = "${coalesce(var.free_storage_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.free_storage_time_aggregator}(${var.free_storage_timeframe}): (
|
${var.free_storage_time_aggregator}(${var.free_storage_timeframe}): (
|
||||||
100 - avg:azure.dbforpostgresql_servers.storage_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
100 - avg:azure.dbforpostgresql_servers.storage_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) < ${var.free_storage_threshold_critical}
|
) < ${var.free_storage_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -97,11 +97,11 @@ resource "datadog_monitor" "postgresql_io_consumption" {
|
|||||||
name = "[${var.environment}] Postgresql Server IO consumption {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Postgresql Server IO consumption {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.io_consumption_message, var.message)}"
|
message = "${coalesce(var.io_consumption_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.io_consumption_time_aggregator}(${var.io_consumption_timeframe}): (
|
${var.io_consumption_time_aggregator}(${var.io_consumption_timeframe}): (
|
||||||
avg:azure.dbforpostgresql_servers.io_consumption_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.dbforpostgresql_servers.io_consumption_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.io_consumption_threshold_critical}
|
) > ${var.io_consumption_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -130,11 +130,11 @@ resource "datadog_monitor" "postgresql_memory_usage" {
|
|||||||
name = "[${var.environment}] Postgresql Server memory usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Postgresql Server memory usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.memory_usage_message, var.message)}"
|
message = "${coalesce(var.memory_usage_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.memory_usage_time_aggregator}(${var.memory_usage_timeframe}): (
|
${var.memory_usage_time_aggregator}(${var.memory_usage_timeframe}): (
|
||||||
avg:azure.dbforpostgresql_servers.memory_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.dbforpostgresql_servers.memory_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.memory_usage_threshold_critical}
|
) > ${var.memory_usage_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -3,11 +3,11 @@ resource "datadog_monitor" "status" {
|
|||||||
name = "[${var.environment}] Redis {{name}} is down"
|
name = "[${var.environment}] Redis {{name}} is down"
|
||||||
message = "${coalesce(var.status_message, var.message)}"
|
message = "${coalesce(var.status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.status_time_aggregator}(${var.status_timeframe}): (
|
${var.status_time_aggregator}(${var.status_timeframe}): (
|
||||||
avg:azure.cache_redis.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.cache_redis.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) != 1
|
) != 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -31,11 +31,11 @@ resource "datadog_monitor" "evictedkeys" {
|
|||||||
name = "[${var.environment}] Redis too many evictedkeys {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
name = "[${var.environment}] Redis too many evictedkeys {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||||
message = "${coalesce(var.evictedkeys_limit_message, var.message)}"
|
message = "${coalesce(var.evictedkeys_limit_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.evictedkeys_limit_time_aggregator}(${var.evictedkeys_limit_timeframe}): (
|
${var.evictedkeys_limit_time_aggregator}(${var.evictedkeys_limit_timeframe}): (
|
||||||
avg:azure.cache_redis.evictedkeys${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.cache_redis.evictedkeys${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.evictedkeys_limit_threshold_critical}
|
) > ${var.evictedkeys_limit_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -64,11 +64,11 @@ resource "datadog_monitor" "percent_processor_time" {
|
|||||||
name = "[${var.environment}] Redis processor time too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Redis processor time too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.percent_processor_time_message, var.message)}"
|
message = "${coalesce(var.percent_processor_time_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.percent_processor_time_time_aggregator}(${var.percent_processor_time_timeframe}): (
|
${var.percent_processor_time_time_aggregator}(${var.percent_processor_time_timeframe}): (
|
||||||
avg:azure.cache_redis.percent_processor_time${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.cache_redis.percent_processor_time${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.percent_processor_time_threshold_critical}
|
) > ${var.percent_processor_time_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -97,11 +97,11 @@ resource "datadog_monitor" "server_load" {
|
|||||||
name = "[${var.environment}] Redis server load too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Redis server load too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.server_load_rate_message, var.message)}"
|
message = "${coalesce(var.server_load_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.server_load_rate_time_aggregator}(${var.server_load_rate_timeframe}): (
|
${var.server_load_rate_time_aggregator}(${var.server_load_rate_timeframe}): (
|
||||||
avg:azure.cache_redis.server_load${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.cache_redis.server_load${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.server_load_rate_threshold_critical}
|
) > ${var.server_load_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -3,11 +3,11 @@ resource "datadog_monitor" "status" {
|
|||||||
name = "[${var.environment}] Serverfarm is down"
|
name = "[${var.environment}] Serverfarm is down"
|
||||||
message = "${coalesce(var.status_message, var.message)}"
|
message = "${coalesce(var.status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.status_time_aggregator}(${var.status_timeframe}): (
|
${var.status_time_aggregator}(${var.status_timeframe}): (
|
||||||
avg:azure.web_serverfarms.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.web_serverfarms.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) != 1
|
) != 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -31,11 +31,11 @@ resource "datadog_monitor" "cpu_percentage" {
|
|||||||
name = "[${var.environment}] Serverfarm CPU percentage is too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Serverfarm CPU percentage is too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.cpu_percentage_message, var.message)}"
|
message = "${coalesce(var.cpu_percentage_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cpu_percentage_time_aggregator}(${var.cpu_percentage_timeframe}): (
|
${var.cpu_percentage_time_aggregator}(${var.cpu_percentage_timeframe}): (
|
||||||
avg:azure.web_serverfarms.cpu_percentage${module.filter-tags.query_alert} by {resource_group,region,name,instance}
|
avg:azure.web_serverfarms.cpu_percentage${module.filter-tags.query_alert} by {resource_group,region,name,instance}
|
||||||
) > ${var.cpu_percentage_threshold_critical}
|
) > ${var.cpu_percentage_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -64,11 +64,11 @@ resource "datadog_monitor" "memory_percentage" {
|
|||||||
name = "[${var.environment}] Serverfarm memory percentage is too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Serverfarm memory percentage is too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.memory_percentage_message, var.message)}"
|
message = "${coalesce(var.memory_percentage_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.memory_percentage_time_aggregator}(${var.memory_percentage_timeframe}): (
|
${var.memory_percentage_time_aggregator}(${var.memory_percentage_timeframe}): (
|
||||||
avg:azure.web_serverfarms.memory_percentage${module.filter-tags.query_alert} by {resource_group,region,name,instance}
|
avg:azure.web_serverfarms.memory_percentage${module.filter-tags.query_alert} by {resource_group,region,name,instance}
|
||||||
) > ${var.memory_percentage_threshold_critical}
|
) > ${var.memory_percentage_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -4,11 +4,11 @@ resource "datadog_monitor" "servicebus_status" {
|
|||||||
name = "[${var.environment}] Service Bus is down"
|
name = "[${var.environment}] Service Bus is down"
|
||||||
message = "${coalesce(var.status_message, var.message)}"
|
message = "${coalesce(var.status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.status_time_aggregator}(${var.status_timeframe}): (
|
${var.status_time_aggregator}(${var.status_timeframe}): (
|
||||||
avg:azure.servicebus_namespaces.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.servicebus_namespaces.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) != 1
|
) != 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -33,11 +33,11 @@ resource "datadog_monitor" "service_bus_no_active_connections" {
|
|||||||
name = "[${var.environment}] Service Bus has no active connection"
|
name = "[${var.environment}] Service Bus has no active connection"
|
||||||
message = "${coalesce(var.no_active_connections_message, var.message)}"
|
message = "${coalesce(var.no_active_connections_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.no_active_connections_time_aggregator}(${var.no_active_connections_timeframe}): (
|
${var.no_active_connections_time_aggregator}(${var.no_active_connections_timeframe}): (
|
||||||
avg:azure.servicebus_namespaces.active_connections_preview${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.servicebus_namespaces.active_connections_preview${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) < 1
|
) < 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -62,12 +62,12 @@ resource "datadog_monitor" "service_bus_user_errors" {
|
|||||||
name = "[${var.environment}] Service Bus user errors rate is high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Service Bus user errors rate is high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.user_errors_message, var.message)}"
|
message = "${coalesce(var.user_errors_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.user_errors_time_aggregator}(${var.user_errors_timeframe}): (
|
${var.user_errors_time_aggregator}(${var.user_errors_timeframe}): (
|
||||||
default(avg:azure.servicebus_namespaces.user_errors.preview${module.filter-tags.query_alert} by {resource_group,region,name,entityname}, 0) /
|
default(avg:azure.servicebus_namespaces.user_errors.preview${module.filter-tags.query_alert} by {resource_group,region,name,entityname}, 0) /
|
||||||
default(avg:azure.servicebus_namespaces.incoming_requests_preview${module.filter-tags.query_alert} by {resource_group,region,name,entityname}, 1)
|
default(avg:azure.servicebus_namespaces.incoming_requests_preview${module.filter-tags.query_alert} by {resource_group,region,name,entityname}, 1)
|
||||||
) * 100 > ${var.user_errors_threshold_critical}
|
) * 100 > ${var.user_errors_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -97,12 +97,12 @@ resource "datadog_monitor" "service_bus_server_errors" {
|
|||||||
name = "[${var.environment}] Service Bus server errors rate is high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Service Bus server errors rate is high {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.server_errors_message, var.message)}"
|
message = "${coalesce(var.server_errors_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.server_errors_time_aggregator}(${var.server_errors_timeframe}): (
|
${var.server_errors_time_aggregator}(${var.server_errors_timeframe}): (
|
||||||
default(avg:azure.servicebus_namespaces.server_errors.preview${module.filter-tags.query_alert} by {resource_group,region,name,entityname}, 0) /
|
default(avg:azure.servicebus_namespaces.server_errors.preview${module.filter-tags.query_alert} by {resource_group,region,name,entityname}, 0) /
|
||||||
default(avg:azure.servicebus_namespaces.incoming_requests_preview${module.filter-tags.query_alert} by {resource_group,region,name,entityname}, 1)
|
default(avg:azure.servicebus_namespaces.incoming_requests_preview${module.filter-tags.query_alert} by {resource_group,region,name,entityname}, 1)
|
||||||
) * 100 > ${var.server_errors_threshold_critical}
|
) * 100 > ${var.server_errors_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -3,11 +3,11 @@ resource "datadog_monitor" "status" {
|
|||||||
name = "[${var.environment}] SQL Database is down"
|
name = "[${var.environment}] SQL Database is down"
|
||||||
message = "${coalesce(var.status_message, var.message)}"
|
message = "${coalesce(var.status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.status_time_aggregator}(${var.status_timeframe}): (
|
${var.status_time_aggregator}(${var.status_timeframe}): (
|
||||||
avg:azure.sql_servers_databases.status${module.filter-tags.query_alert} by {resource_group,region,server_name,name}
|
avg:azure.sql_servers_databases.status${module.filter-tags.query_alert} by {resource_group,region,server_name,name}
|
||||||
) != 1
|
) != 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -31,11 +31,11 @@ resource "datadog_monitor" "sql-database_cpu_90_15min" {
|
|||||||
name = "[${var.environment}] SQL Database CPU too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] SQL Database CPU too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.cpu_message, var.message)}"
|
message = "${coalesce(var.cpu_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cpu_time_aggregator}(${var.cpu_timeframe}): (
|
${var.cpu_time_aggregator}(${var.cpu_timeframe}): (
|
||||||
avg:azure.sql_servers_databases.cpu_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.sql_servers_databases.cpu_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.cpu_threshold_critical}
|
) > ${var.cpu_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -65,11 +65,11 @@ resource "datadog_monitor" "sql-database_free_space_low" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.diskspace_time_aggregator}(${var.diskspace_timeframe}): (
|
${var.diskspace_time_aggregator}(${var.diskspace_timeframe}): (
|
||||||
avg:azure.sql_servers_databases.storage_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.sql_servers_databases.storage_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.diskspace_threshold_critical}
|
) > ${var.diskspace_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.diskspace_threshold_warning}"
|
warning = "${var.diskspace_threshold_warning}"
|
||||||
@ -98,11 +98,11 @@ resource "datadog_monitor" "sql-database_dtu_consumption_high" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.dtu_time_aggregator}(${var.dtu_timeframe}): (
|
${var.dtu_time_aggregator}(${var.dtu_timeframe}): (
|
||||||
azure.sql_servers_databases.dtu_consumption_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
azure.sql_servers_databases.dtu_consumption_percent${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.dtu_threshold_critical}
|
) > ${var.dtu_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.dtu_threshold_warning}"
|
warning = "${var.dtu_threshold_warning}"
|
||||||
@ -131,11 +131,11 @@ resource "datadog_monitor" "sql-database_deadlocks_count" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
sum(${var.deadlock_timeframe}): (
|
sum(${var.deadlock_timeframe}): (
|
||||||
avg:azure.sql_servers_databases.deadlock${module.filter-tags.query_alert} by {resource_group,region,name}.as_count()
|
avg:azure.sql_servers_databases.deadlock${module.filter-tags.query_alert} by {resource_group,region,name}.as_count()
|
||||||
) > ${var.deadlock_threshold_critical}
|
) > ${var.deadlock_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.deadlock_threshold_critical}"
|
critical = "${var.deadlock_threshold_critical}"
|
||||||
|
|||||||
@ -3,11 +3,11 @@ resource "datadog_monitor" "availability" {
|
|||||||
name = "[${var.environment}] Azure Storage is down"
|
name = "[${var.environment}] Azure Storage is down"
|
||||||
message = "${coalesce(var.availability_message, var.message)}"
|
message = "${coalesce(var.availability_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.availability_time_aggregator}(${var.availability_timeframe}): (default(
|
${var.availability_time_aggregator}(${var.availability_timeframe}): (default(
|
||||||
avg:azure.storage.availability${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
avg:azure.storage.availability${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
||||||
100)) < ${var.availability_threshold_critical}
|
100)) < ${var.availability_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.availability_threshold_critical}"
|
critical = "${var.availability_threshold_critical}"
|
||||||
@ -35,11 +35,11 @@ resource "datadog_monitor" "successful_requests" {
|
|||||||
name = "[${var.environment}] Azure Storage too few successful requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Azure Storage too few successful requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.successful_requests_message, var.message)}"
|
message = "${coalesce(var.successful_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.successful_requests_time_aggregator}(${var.successful_requests_timeframe}): (default(
|
${var.successful_requests_time_aggregator}(${var.successful_requests_timeframe}): (default(
|
||||||
avg:azure.storage.percent_success${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
avg:azure.storage.percent_success${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
||||||
100)) < ${var.successful_requests_threshold_critical}
|
100)) < ${var.successful_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.successful_requests_threshold_critical}"
|
critical = "${var.successful_requests_threshold_critical}"
|
||||||
@ -67,11 +67,11 @@ resource "datadog_monitor" "latency" {
|
|||||||
name = "[${var.environment}] Azure Storage too high end to end latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
|
name = "[${var.environment}] Azure Storage too high end to end latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
|
||||||
message = "${coalesce(var.latency_message, var.message)}"
|
message = "${coalesce(var.latency_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.latency_time_aggregator}(${var.latency_timeframe}): (default(
|
${var.latency_time_aggregator}(${var.latency_timeframe}): (default(
|
||||||
avg:azure.storage.average_e2_e_latency${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
avg:azure.storage.average_e2_e_latency${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
||||||
0)) > ${var.latency_threshold_critical}
|
0)) > ${var.latency_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.latency_threshold_critical}"
|
critical = "${var.latency_threshold_critical}"
|
||||||
@ -99,11 +99,11 @@ resource "datadog_monitor" "timeout_error_requests" {
|
|||||||
name = "[${var.environment}] Azure Storage too many timeout errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Azure Storage too many timeout errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.timeout_error_requests_message, var.message)}"
|
message = "${coalesce(var.timeout_error_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.timeout_error_requests_time_aggregator}(${var.timeout_error_requests_timeframe}): (default(
|
${var.timeout_error_requests_time_aggregator}(${var.timeout_error_requests_timeframe}): (default(
|
||||||
avg:azure.storage.percent_timeout_error${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
avg:azure.storage.percent_timeout_error${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
||||||
0)) > ${var.timeout_error_requests_threshold_critical}
|
0)) > ${var.timeout_error_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.timeout_error_requests_threshold_critical}"
|
critical = "${var.timeout_error_requests_threshold_critical}"
|
||||||
@ -131,11 +131,11 @@ resource "datadog_monitor" "network_error_requests" {
|
|||||||
name = "[${var.environment}] Azure Storage too many network errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Azure Storage too many network errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.network_error_requests_message, var.message)}"
|
message = "${coalesce(var.network_error_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.network_error_requests_time_aggregator}(${var.network_error_requests_timeframe}): (default(
|
${var.network_error_requests_time_aggregator}(${var.network_error_requests_timeframe}): (default(
|
||||||
avg:azure.storage.percent_network_error${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
avg:azure.storage.percent_network_error${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
||||||
0)) > ${var.network_error_requests_threshold_critical}
|
0)) > ${var.network_error_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.network_error_requests_threshold_critical}"
|
critical = "${var.network_error_requests_threshold_critical}"
|
||||||
@ -163,11 +163,11 @@ resource "datadog_monitor" "throttling_error_requests" {
|
|||||||
name = "[${var.environment}] Azure Storage too many throttling errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Azure Storage too many throttling errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.throttling_error_requests_message, var.message)}"
|
message = "${coalesce(var.throttling_error_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.throttling_error_requests_time_aggregator}(${var.throttling_error_requests_timeframe}): (default(
|
${var.throttling_error_requests_time_aggregator}(${var.throttling_error_requests_timeframe}): (default(
|
||||||
avg:azure.storage.percent_throttling_error${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
avg:azure.storage.percent_throttling_error${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
||||||
0)) > ${var.throttling_error_requests_threshold_critical}
|
0)) > ${var.throttling_error_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.throttling_error_requests_threshold_critical}"
|
critical = "${var.throttling_error_requests_threshold_critical}"
|
||||||
@ -195,11 +195,11 @@ resource "datadog_monitor" "server_other_error_requests" {
|
|||||||
name = "[${var.environment}] Azure Storage too many server_other errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Azure Storage too many server_other errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.server_other_error_requests_message, var.message)}"
|
message = "${coalesce(var.server_other_error_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.server_other_error_requests_time_aggregator}(${var.server_other_error_requests_timeframe}): (default(
|
${var.server_other_error_requests_time_aggregator}(${var.server_other_error_requests_timeframe}): (default(
|
||||||
avg:azure.storage.percent_server_other_error${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
avg:azure.storage.percent_server_other_error${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
||||||
0)) > ${var.server_other_error_requests_threshold_critical}
|
0)) > ${var.server_other_error_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.server_other_error_requests_threshold_critical}"
|
critical = "${var.server_other_error_requests_threshold_critical}"
|
||||||
@ -227,11 +227,11 @@ resource "datadog_monitor" "client_other_error_requests" {
|
|||||||
name = "[${var.environment}] Azure Storage too many client_other errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Azure Storage too many client_other errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.client_other_error_requests_message, var.message)}"
|
message = "${coalesce(var.client_other_error_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.client_other_error_requests_time_aggregator}(${var.client_other_error_requests_timeframe}): (default(
|
${var.client_other_error_requests_time_aggregator}(${var.client_other_error_requests_timeframe}): (default(
|
||||||
avg:azure.storage.percent_client_other_error${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
avg:azure.storage.percent_client_other_error${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
||||||
0)) > ${var.client_other_error_requests_threshold_critical}
|
0)) > ${var.client_other_error_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.client_other_error_requests_threshold_critical}"
|
critical = "${var.client_other_error_requests_threshold_critical}"
|
||||||
@ -259,11 +259,11 @@ resource "datadog_monitor" "authorization_error_requests" {
|
|||||||
name = "[${var.environment}] Azure Storage too many authorization errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Azure Storage too many authorization errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.authorization_error_requests_message, var.message)}"
|
message = "${coalesce(var.authorization_error_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.authorization_error_requests_time_aggregator}(${var.authorization_error_requests_timeframe}): (default(
|
${var.authorization_error_requests_time_aggregator}(${var.authorization_error_requests_timeframe}): (default(
|
||||||
avg:azure.storage.percent_authorization_error${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
avg:azure.storage.percent_authorization_error${module.filter-tags.query_alert} by {resource_group,storage_type,name},
|
||||||
0)) > ${var.authorization_error_requests_threshold_critical}
|
0)) > ${var.authorization_error_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.authorization_error_requests_threshold_critical}"
|
critical = "${var.authorization_error_requests_threshold_critical}"
|
||||||
|
|||||||
@ -3,11 +3,11 @@ resource "datadog_monitor" "status" {
|
|||||||
name = "[${var.environment}] Stream Analytics is down"
|
name = "[${var.environment}] Stream Analytics is down"
|
||||||
message = "${coalesce(var.status_message, var.message)}"
|
message = "${coalesce(var.status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.status_time_aggregator}(${var.status_timeframe}): (
|
${var.status_time_aggregator}(${var.status_timeframe}): (
|
||||||
avg:azure.streamanalytics_streamingjobs.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.streamanalytics_streamingjobs.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) < 1
|
) < 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -31,11 +31,11 @@ resource "datadog_monitor" "su_utilization" {
|
|||||||
name = "[${var.environment}] Stream Analytics streaming units utilization too high {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
name = "[${var.environment}] Stream Analytics streaming units utilization too high {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||||
message = "${coalesce(var.su_utilization_message, var.message)}"
|
message = "${coalesce(var.su_utilization_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.su_utilization_time_aggregator}(${var.su_utilization_timeframe}): (
|
${var.su_utilization_time_aggregator}(${var.su_utilization_timeframe}): (
|
||||||
avg:azure.streamanalytics_streamingjobs.resource_utilization${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.streamanalytics_streamingjobs.resource_utilization${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.su_utilization_threshold_critical}
|
) > ${var.su_utilization_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -64,12 +64,12 @@ resource "datadog_monitor" "failed_function_requests" {
|
|||||||
name = "[${var.environment}] Stream Analytics too many failed requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Stream Analytics too many failed requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.failed_function_requests_message, var.message)}"
|
message = "${coalesce(var.failed_function_requests_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.failed_function_requests_time_aggregator}(${var.failed_function_requests_timeframe}): (
|
${var.failed_function_requests_time_aggregator}(${var.failed_function_requests_timeframe}): (
|
||||||
default(avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
default(avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
|
||||||
default(avg:azure.streamanalytics_streamingjobs.aml_callout_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
default(avg:azure.streamanalytics_streamingjobs.aml_callout_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
|
||||||
) * 100 > ${var.failed_function_requests_threshold_critical}
|
) * 100 > ${var.failed_function_requests_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -98,11 +98,11 @@ resource "datadog_monitor" "conversion_errors" {
|
|||||||
name = "[${var.environment}] Stream Analytics too many conversion errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Stream Analytics too many conversion errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.conversion_errors_message, var.message)}"
|
message = "${coalesce(var.conversion_errors_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.conversion_errors_time_aggregator}(${var.conversion_errors_timeframe}): (
|
${var.conversion_errors_time_aggregator}(${var.conversion_errors_timeframe}): (
|
||||||
avg:azure.streamanalytics_streamingjobs.conversion_errors${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.streamanalytics_streamingjobs.conversion_errors${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.conversion_errors_threshold_critical}
|
) > ${var.conversion_errors_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -131,11 +131,11 @@ resource "datadog_monitor" "runtime_errors" {
|
|||||||
name = "[${var.environment}] Stream Analytics too many runtime errors {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
name = "[${var.environment}] Stream Analytics too many runtime errors {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||||
message = "${coalesce(var.runtime_errors_message, var.message)}"
|
message = "${coalesce(var.runtime_errors_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.runtime_errors_time_aggregator}(${var.runtime_errors_timeframe}): (
|
${var.runtime_errors_time_aggregator}(${var.runtime_errors_timeframe}): (
|
||||||
avg:azure.streamanalytics_streamingjobs.errors${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.streamanalytics_streamingjobs.errors${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.runtime_errors_threshold_critical}
|
) > ${var.runtime_errors_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -4,11 +4,11 @@ resource "datadog_monitor" "virtualmachine_status" {
|
|||||||
name = "[${var.environment}] Virtual Machine is unreachable"
|
name = "[${var.environment}] Virtual Machine is unreachable"
|
||||||
message = "${coalesce(var.status_message, var.message)}"
|
message = "${coalesce(var.status_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.status_time_aggregator}(${var.status_timeframe}): (
|
${var.status_time_aggregator}(${var.status_timeframe}): (
|
||||||
avg:azure.vm.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.vm.status${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) < 1
|
) < 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -32,11 +32,11 @@ resource "datadog_monitor" "virtualmachine_cpu_usage" {
|
|||||||
name = "[${var.environment}] Virtual Machine CPU usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Virtual Machine CPU usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.cpu_usage_message, var.message)}"
|
message = "${coalesce(var.cpu_usage_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cpu_usage_time_aggregator}(${var.cpu_usage_timeframe}): (
|
${var.cpu_usage_time_aggregator}(${var.cpu_usage_timeframe}): (
|
||||||
avg:azure.vm.percentage_cpu${module.filter-tags.query_alert} by {resource_group,region,name}
|
avg:azure.vm.percentage_cpu${module.filter-tags.query_alert} by {resource_group,region,name}
|
||||||
) > ${var.cpu_usage_threshold_critical}
|
) > ${var.cpu_usage_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -65,14 +65,14 @@ resource "datadog_monitor" "virtualmachine_credit_cpu_remaining_too_low" {
|
|||||||
name = "[${var.environment}] Virtual Machine credit CPU {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Virtual Machine credit CPU {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.cpu_remaining_rate_message, var.message)}"
|
message = "${coalesce(var.cpu_remaining_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cpu_remaining_rate_time_aggregator}(${var.cpu_remaining_rate_timeframe}):
|
${var.cpu_remaining_rate_time_aggregator}(${var.cpu_remaining_rate_timeframe}):
|
||||||
default(
|
default(
|
||||||
default(avg:azure.vm.cpu_credits_remaining${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 100) / (
|
default(avg:azure.vm.cpu_credits_remaining${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 100) / (
|
||||||
default(avg:azure.vm.cpu_credits_remaining${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 100) +
|
default(avg:azure.vm.cpu_credits_remaining${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 100) +
|
||||||
default(avg:azure.vm.cpu_credits_consumed${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
default(avg:azure.vm.cpu_credits_consumed${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) )
|
||||||
* 100 , 100) < ${var.cpu_remaining_rate_threshold_critical}
|
* 100 , 100) < ${var.cpu_remaining_rate_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -8,10 +8,10 @@ resource "datadog_monitor" "concurrent_queries" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
avg(${var.concurrent_queries_timeframe}):default(avg:gcp.bigquery.query.count{${var.filter_tags}}, 0)
|
avg(${var.concurrent_queries_timeframe}):default(avg:gcp.bigquery.query.count{${var.filter_tags}}, 0)
|
||||||
> ${var.concurrent_queries_threshold_critical}
|
> ${var.concurrent_queries_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.concurrent_queries_threshold_warning}"
|
warning = "${var.concurrent_queries_threshold_warning}"
|
||||||
@ -44,10 +44,10 @@ resource "datadog_monitor" "execution_time" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
avg(${var.execution_time_timeframe}):default(avg:gcp.bigquery.query.execution_times.avg{${var.filter_tags}}, 0)
|
avg(${var.execution_time_timeframe}):default(avg:gcp.bigquery.query.execution_times.avg{${var.filter_tags}}, 0)
|
||||||
> ${var.execution_time_threshold_critical}
|
> ${var.execution_time_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.execution_time_threshold_warning}"
|
warning = "${var.execution_time_threshold_warning}"
|
||||||
@ -80,10 +80,10 @@ resource "datadog_monitor" "scanned_bytes" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
avg(${var.scanned_bytes_timeframe}):default(avg:gcp.bigquery.query.scanned_bytes{${var.filter_tags}}, 0)
|
avg(${var.scanned_bytes_timeframe}):default(avg:gcp.bigquery.query.scanned_bytes{${var.filter_tags}}, 0)
|
||||||
> ${var.scanned_bytes_threshold_critical}
|
> ${var.scanned_bytes_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.scanned_bytes_threshold_warning}"
|
warning = "${var.scanned_bytes_threshold_warning}"
|
||||||
@ -116,10 +116,10 @@ resource "datadog_monitor" "scanned_bytes_billed" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
avg(${var.scanned_bytes_billed_timeframe}):default(avg:gcp.bigquery.query.scanned_bytes_billed{${var.filter_tags}}, 0)
|
avg(${var.scanned_bytes_billed_timeframe}):default(avg:gcp.bigquery.query.scanned_bytes_billed{${var.filter_tags}}, 0)
|
||||||
> ${var.scanned_bytes_billed_threshold_critical}
|
> ${var.scanned_bytes_billed_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.scanned_bytes_billed_threshold_warning}"
|
warning = "${var.scanned_bytes_billed_threshold_warning}"
|
||||||
@ -152,10 +152,10 @@ resource "datadog_monitor" "available_slots" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
avg(${var.available_slots_timeframe}):avg:gcp.bigquery.slots.total_available{${var.filter_tags}}
|
avg(${var.available_slots_timeframe}):avg:gcp.bigquery.slots.total_available{${var.filter_tags}}
|
||||||
< ${var.available_slots_threshold_critical}
|
< ${var.available_slots_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.available_slots_threshold_warning}"
|
warning = "${var.available_slots_threshold_warning}"
|
||||||
@ -188,10 +188,10 @@ resource "datadog_monitor" "stored_bytes" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
avg(${var.stored_bytes_timeframe}):default(avg:gcp.bigquery.storage.stored_bytes{${var.filter_tags}} by {dataset_id,table}, 0)
|
avg(${var.stored_bytes_timeframe}):default(avg:gcp.bigquery.storage.stored_bytes{${var.filter_tags}} by {dataset_id,table}, 0)
|
||||||
> ${var.stored_bytes_threshold_critical}
|
> ${var.stored_bytes_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.stored_bytes_threshold_warning}"
|
warning = "${var.stored_bytes_threshold_warning}"
|
||||||
@ -224,10 +224,10 @@ resource "datadog_monitor" "table_count" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
avg(${var.table_count_timeframe}):avg:gcp.bigquery.storage.table_count{${var.filter_tags}} by {dataset_id}
|
avg(${var.table_count_timeframe}):avg:gcp.bigquery.storage.table_count{${var.filter_tags}} by {dataset_id}
|
||||||
> ${var.table_count_threshold_critical}
|
> ${var.table_count_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.table_count_threshold_warning}"
|
warning = "${var.table_count_threshold_warning}"
|
||||||
@ -260,10 +260,10 @@ resource "datadog_monitor" "uploaded_bytes" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
avg(${var.uploaded_bytes_timeframe}):default(avg:gcp.bigquery.storage.uploaded_bytes{${var.filter_tags}} by {dataset_id,table}, 0)
|
avg(${var.uploaded_bytes_timeframe}):default(avg:gcp.bigquery.storage.uploaded_bytes{${var.filter_tags}} by {dataset_id,table}, 0)
|
||||||
> ${var.uploaded_bytes_threshold_critical}
|
> ${var.uploaded_bytes_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.uploaded_bytes_threshold_warning}"
|
warning = "${var.uploaded_bytes_threshold_warning}"
|
||||||
@ -296,10 +296,10 @@ resource "datadog_monitor" "uploaded_bytes_billed" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
avg(${var.uploaded_bytes_billed_timeframe}):default(avg:gcp.bigquery.storage.uploaded_bytes_billed{${var.filter_tags}} by {dataset_id,table}, 0)
|
avg(${var.uploaded_bytes_billed_timeframe}):default(avg:gcp.bigquery.storage.uploaded_bytes_billed{${var.filter_tags}} by {dataset_id,table}, 0)
|
||||||
> ${var.uploaded_bytes_billed_threshold_critical}
|
> ${var.uploaded_bytes_billed_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.uploaded_bytes_billed_threshold_warning}"
|
warning = "${var.uploaded_bytes_billed_threshold_warning}"
|
||||||
|
|||||||
@ -8,12 +8,12 @@ resource "datadog_monitor" "cpu_utilization" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cpu_utilization_time_aggregator}(${var.cpu_utilization_timeframe}):
|
${var.cpu_utilization_time_aggregator}(${var.cpu_utilization_timeframe}):
|
||||||
avg:gcp.cloudsql.database.cpu.utilization{${var.filter_tags}}
|
avg:gcp.cloudsql.database.cpu.utilization{${var.filter_tags}}
|
||||||
by {database_id} * 100
|
by {database_id} * 100
|
||||||
> ${var.cpu_utilization_threshold_critical}
|
> ${var.cpu_utilization_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.cpu_utilization_threshold_warning}"
|
warning = "${var.cpu_utilization_threshold_warning}"
|
||||||
@ -46,12 +46,12 @@ resource "datadog_monitor" "disk_utilization" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.disk_utilization_time_aggregator}(${var.disk_utilization_timeframe}):
|
${var.disk_utilization_time_aggregator}(${var.disk_utilization_timeframe}):
|
||||||
avg:gcp.cloudsql.database.disk.utilization{${var.filter_tags}}
|
avg:gcp.cloudsql.database.disk.utilization{${var.filter_tags}}
|
||||||
by {database_id} * 100
|
by {database_id} * 100
|
||||||
> ${var.disk_utilization_threshold_critical}
|
> ${var.disk_utilization_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.disk_utilization_threshold_warning}"
|
warning = "${var.disk_utilization_threshold_warning}"
|
||||||
@ -84,7 +84,7 @@ resource "datadog_monitor" "disk_utilization_forecast" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.disk_utilization_forecast_time_aggregator}(${var.disk_utilization_forecast_timeframe}):
|
${var.disk_utilization_forecast_time_aggregator}(${var.disk_utilization_forecast_timeframe}):
|
||||||
forecast(
|
forecast(
|
||||||
avg:gcp.cloudsql.database.disk.utilization{${var.filter_tags}} by {database_id} * 100,
|
avg:gcp.cloudsql.database.disk.utilization{${var.filter_tags}} by {database_id} * 100,
|
||||||
@ -95,7 +95,7 @@ resource "datadog_monitor" "disk_utilization_forecast" {
|
|||||||
${var.disk_utilization_forecast_algorithm == "seasonal" ? format("seasonality='%s'", var.disk_utilization_forecast_seasonal_seasonality): ""}
|
${var.disk_utilization_forecast_algorithm == "seasonal" ? format("seasonality='%s'", var.disk_utilization_forecast_seasonal_seasonality): ""}
|
||||||
)
|
)
|
||||||
>= ${var.disk_utilization_forecast_threshold_critical}
|
>= ${var.disk_utilization_forecast_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.disk_utilization_forecast_threshold_critical}"
|
critical = "${var.disk_utilization_forecast_threshold_critical}"
|
||||||
@ -128,12 +128,12 @@ resource "datadog_monitor" "memory_utilization" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.memory_utilization_time_aggregator}(${var.memory_utilization_timeframe}):
|
${var.memory_utilization_time_aggregator}(${var.memory_utilization_timeframe}):
|
||||||
avg:gcp.cloudsql.database.memory.utilization{${var.filter_tags}}
|
avg:gcp.cloudsql.database.memory.utilization{${var.filter_tags}}
|
||||||
by {database_id} * 100
|
by {database_id} * 100
|
||||||
> ${var.memory_utilization_threshold_critical}
|
> ${var.memory_utilization_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.memory_utilization_threshold_warning}"
|
warning = "${var.memory_utilization_threshold_warning}"
|
||||||
@ -166,7 +166,7 @@ resource "datadog_monitor" "memory_utilization_forecast" {
|
|||||||
|
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.memory_utilization_forecast_time_aggregator}(${var.memory_utilization_forecast_timeframe}):
|
${var.memory_utilization_forecast_time_aggregator}(${var.memory_utilization_forecast_timeframe}):
|
||||||
forecast(
|
forecast(
|
||||||
avg:gcp.cloudsql.database.memory.utilization{${var.filter_tags}} by {database_id} * 100,
|
avg:gcp.cloudsql.database.memory.utilization{${var.filter_tags}} by {database_id} * 100,
|
||||||
@ -177,7 +177,7 @@ resource "datadog_monitor" "memory_utilization_forecast" {
|
|||||||
${var.memory_utilization_forecast_algorithm == "seasonal" ? format("seasonality='%s'", var.memory_utilization_forecast_seasonal_seasonality): ""}
|
${var.memory_utilization_forecast_algorithm == "seasonal" ? format("seasonality='%s'", var.memory_utilization_forecast_seasonal_seasonality): ""}
|
||||||
)
|
)
|
||||||
>= ${var.memory_utilization_forecast_threshold_critical}
|
>= ${var.memory_utilization_forecast_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.memory_utilization_forecast_threshold_critical}"
|
critical = "${var.memory_utilization_forecast_threshold_critical}"
|
||||||
@ -210,12 +210,12 @@ resource "datadog_monitor" "failover_unavailable" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.failover_unavailable_time_aggregator}(${var.failover_unavailable_timeframe}):
|
${var.failover_unavailable_time_aggregator}(${var.failover_unavailable_timeframe}):
|
||||||
avg:gcp.cloudsql.database.available_for_failover{${var.filter_tags}}
|
avg:gcp.cloudsql.database.available_for_failover{${var.filter_tags}}
|
||||||
by {database_id}
|
by {database_id}
|
||||||
<= ${var.failover_unavailable_threshold_critical}
|
<= ${var.failover_unavailable_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.failover_unavailable_threshold_critical}"
|
critical = "${var.failover_unavailable_threshold_critical}"
|
||||||
|
|||||||
@ -8,12 +8,12 @@ resource "datadog_monitor" "replication_lag" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.replication_lag_time_aggregator}(${var.replication_lag_timeframe}):
|
${var.replication_lag_time_aggregator}(${var.replication_lag_timeframe}):
|
||||||
avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{${var.filter_tags}}
|
avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{${var.filter_tags}}
|
||||||
by {database_id}
|
by {database_id}
|
||||||
> ${var.replication_lag_threshold_critical}
|
> ${var.replication_lag_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.replication_lag_threshold_critical}"
|
critical = "${var.replication_lag_threshold_critical}"
|
||||||
|
|||||||
@ -8,11 +8,11 @@ resource "datadog_monitor" "cpu_utilization" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cpu_utilization_time_aggregator}(${var.cpu_utilization_timeframe}):
|
${var.cpu_utilization_time_aggregator}(${var.cpu_utilization_timeframe}):
|
||||||
avg:gcp.gce.instance.cpu.utilization{${var.filter_tags}} by {instance_name} * 100
|
avg:gcp.gce.instance.cpu.utilization{${var.filter_tags}} by {instance_name} * 100
|
||||||
> ${var.cpu_utilization_threshold_critical}
|
> ${var.cpu_utilization_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.cpu_utilization_threshold_warning}"
|
warning = "${var.cpu_utilization_threshold_warning}"
|
||||||
@ -45,7 +45,7 @@ resource "datadog_monitor" "disk_throttled_bps" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.disk_throttled_bps_time_aggregator}(${var.disk_throttled_bps_timeframe}):
|
${var.disk_throttled_bps_time_aggregator}(${var.disk_throttled_bps_timeframe}):
|
||||||
(
|
(
|
||||||
sum:gcp.gce.instance.disk.throttled_read_bytes_count{${var.filter_tags}} by {instance_name, device_name} +
|
sum:gcp.gce.instance.disk.throttled_read_bytes_count{${var.filter_tags}} by {instance_name, device_name} +
|
||||||
@ -55,7 +55,7 @@ resource "datadog_monitor" "disk_throttled_bps" {
|
|||||||
sum:gcp.gce.instance.disk.write_bytes_count{${var.filter_tags}} by {instance_name, device_name}
|
sum:gcp.gce.instance.disk.write_bytes_count{${var.filter_tags}} by {instance_name, device_name}
|
||||||
) * 100
|
) * 100
|
||||||
> ${var.disk_throttled_bps_threshold_critical}
|
> ${var.disk_throttled_bps_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.disk_throttled_bps_threshold_warning}"
|
warning = "${var.disk_throttled_bps_threshold_warning}"
|
||||||
@ -88,7 +88,7 @@ resource "datadog_monitor" "disk_throttled_ops" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.disk_throttled_ops_time_aggregator}(${var.disk_throttled_ops_timeframe}):
|
${var.disk_throttled_ops_time_aggregator}(${var.disk_throttled_ops_timeframe}):
|
||||||
(
|
(
|
||||||
sum:gcp.gce.instance.disk.throttled_read_ops_count{${var.filter_tags}} by {instance_name, device_name} +
|
sum:gcp.gce.instance.disk.throttled_read_ops_count{${var.filter_tags}} by {instance_name, device_name} +
|
||||||
@ -98,7 +98,7 @@ resource "datadog_monitor" "disk_throttled_ops" {
|
|||||||
sum:gcp.gce.instance.disk.write_ops_count{${var.filter_tags}} by {instance_name, device_name}
|
sum:gcp.gce.instance.disk.write_ops_count{${var.filter_tags}} by {instance_name, device_name}
|
||||||
) * 100
|
) * 100
|
||||||
> ${var.disk_throttled_ops_threshold_critical}
|
> ${var.disk_throttled_ops_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.disk_throttled_ops_threshold_warning}"
|
warning = "${var.disk_throttled_ops_threshold_warning}"
|
||||||
|
|||||||
@ -8,12 +8,12 @@ resource "datadog_monitor" "error_rate_4xx" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.error_rate_4xx_time_aggregator}(${var.error_rate_4xx_timeframe}):
|
${var.error_rate_4xx_time_aggregator}(${var.error_rate_4xx_timeframe}):
|
||||||
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags},response_code_class:400} by {forwarding_rule_name}.as_rate(), 0) / (
|
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags},response_code_class:400} by {forwarding_rule_name}.as_rate(), 0) / (
|
||||||
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags}} by {forwarding_rule_name}.as_rate() + ${var.error_rate_4xx_artificial_request}, 1))
|
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags}} by {forwarding_rule_name}.as_rate() + ${var.error_rate_4xx_artificial_request}, 1))
|
||||||
* 100 > ${var.error_rate_4xx_threshold_critical}
|
* 100 > ${var.error_rate_4xx_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.error_rate_4xx_threshold_warning}"
|
warning = "${var.error_rate_4xx_threshold_warning}"
|
||||||
@ -46,12 +46,12 @@ resource "datadog_monitor" "error_rate_5xx" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.error_rate_5xx_time_aggregator}(${var.error_rate_5xx_timeframe}):
|
${var.error_rate_5xx_time_aggregator}(${var.error_rate_5xx_timeframe}):
|
||||||
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags},response_code_class:500} by {forwarding_rule_name}.as_rate(), 0) / (
|
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags},response_code_class:500} by {forwarding_rule_name}.as_rate(), 0) / (
|
||||||
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags}} by {forwarding_rule_name}.as_rate() + ${var.error_rate_5xx_artificial_request}, 1))
|
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags}} by {forwarding_rule_name}.as_rate() + ${var.error_rate_5xx_artificial_request}, 1))
|
||||||
* 100 > ${var.error_rate_5xx_threshold_critical}
|
* 100 > ${var.error_rate_5xx_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.error_rate_5xx_threshold_warning}"
|
warning = "${var.error_rate_5xx_threshold_warning}"
|
||||||
@ -84,11 +84,11 @@ resource "datadog_monitor" "backend_latency_service" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.backend_latency_service_time_aggregator}(${var.backend_latency_service_timeframe}):
|
${var.backend_latency_service_time_aggregator}(${var.backend_latency_service_timeframe}):
|
||||||
default(min:gcp.loadbalancing.https.backend_latencies.avg{${var.filter_tags},backend_target_type:backend_service} by {backend_target_name,forwarding_rule_name}, 0)
|
default(min:gcp.loadbalancing.https.backend_latencies.avg{${var.filter_tags},backend_target_type:backend_service} by {backend_target_name,forwarding_rule_name}, 0)
|
||||||
> ${var.backend_latency_service_threshold_critical}
|
> ${var.backend_latency_service_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.backend_latency_service_threshold_warning}"
|
warning = "${var.backend_latency_service_threshold_warning}"
|
||||||
@ -121,11 +121,11 @@ resource "datadog_monitor" "backend_latency_bucket" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.backend_latency_bucket_time_aggregator}(${var.backend_latency_bucket_timeframe}):
|
${var.backend_latency_bucket_time_aggregator}(${var.backend_latency_bucket_timeframe}):
|
||||||
default(min:gcp.loadbalancing.https.backend_latencies.avg{${var.filter_tags},backend_target_type:backend_bucket} by {backend_target_name,forwarding_rule_name}, 0)
|
default(min:gcp.loadbalancing.https.backend_latencies.avg{${var.filter_tags},backend_target_type:backend_bucket} by {backend_target_name,forwarding_rule_name}, 0)
|
||||||
> ${var.backend_latency_bucket_threshold_critical}
|
> ${var.backend_latency_bucket_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.backend_latency_bucket_threshold_warning}"
|
warning = "${var.backend_latency_bucket_threshold_warning}"
|
||||||
@ -158,11 +158,11 @@ resource "datadog_monitor" "request_count" {
|
|||||||
|
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
pct_change(${var.request_count_time_aggregator}(${var.request_count_timeframe}),${var.request_count_timeshift}):
|
pct_change(${var.request_count_time_aggregator}(${var.request_count_timeframe}),${var.request_count_timeshift}):
|
||||||
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags}} by {forwarding_rule_name}.as_count(), 0)
|
default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags}} by {forwarding_rule_name}.as_count(), 0)
|
||||||
> ${var.request_count_threshold_critical}
|
> ${var.request_count_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.request_count_threshold_warning}"
|
warning = "${var.request_count_threshold_warning}"
|
||||||
|
|||||||
@ -8,11 +8,11 @@ resource "datadog_monitor" "sending_operations_count" {
|
|||||||
|
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.sending_operations_count_time_aggregator}(${var.sending_operations_count_timeframe}):
|
${var.sending_operations_count_time_aggregator}(${var.sending_operations_count_timeframe}):
|
||||||
default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags}} by {topic_id}.as_count(), 0)
|
default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags}} by {topic_id}.as_count(), 0)
|
||||||
<= ${var.sending_operations_count_threshold_critical}
|
<= ${var.sending_operations_count_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.sending_operations_count_threshold_critical}"
|
critical = "${var.sending_operations_count_threshold_critical}"
|
||||||
@ -44,11 +44,11 @@ resource "datadog_monitor" "unavailable_sending_operations_count" {
|
|||||||
|
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.unavailable_sending_operations_count_time_aggregator}(${var.unavailable_sending_operations_count_timeframe}):
|
${var.unavailable_sending_operations_count_time_aggregator}(${var.unavailable_sending_operations_count_timeframe}):
|
||||||
default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags},response_code:unavailable} by {topic_id}.as_count(), 0)
|
default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags},response_code:unavailable} by {topic_id}.as_count(), 0)
|
||||||
>= ${var.unavailable_sending_operations_count_threshold_critical}
|
>= ${var.unavailable_sending_operations_count_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.unavailable_sending_operations_count_threshold_warning}"
|
warning = "${var.unavailable_sending_operations_count_threshold_warning}"
|
||||||
|
|||||||
@ -6,9 +6,9 @@ resource "datadog_monitor" "not_responding" {
|
|||||||
name = "[${var.environment}] ElasticSearch does not respond"
|
name = "[${var.environment}] ElasticSearch does not respond"
|
||||||
message = "${coalesce(var.not_responding_message, var.message)}"
|
message = "${coalesce(var.not_responding_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOL
|
query = <<EOQ
|
||||||
"elasticsearch.can_connect"${module.filter-tags.service_check}.by("server","port").last(6).count_by_status()
|
"elasticsearch.can_connect"${module.filter-tags.service_check}.by("server","port").last(6).count_by_status()
|
||||||
EOL
|
EOQ
|
||||||
|
|
||||||
type = "service check"
|
type = "service check"
|
||||||
|
|
||||||
@ -51,11 +51,11 @@ resource "datadog_monitor" "cluster_status_not_green" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cluster_status_not_green_time_aggregator}(${var.cluster_status_not_green_timeframe}):
|
${var.cluster_status_not_green_time_aggregator}(${var.cluster_status_not_green_timeframe}):
|
||||||
min:elasticsearch.cluster_status${module.filter-tags.query_alert} by {cluster_name}
|
min:elasticsearch.cluster_status${module.filter-tags.query_alert} by {cluster_name}
|
||||||
<= ${var.cluster_status_not_green_threshold_critical}
|
<= ${var.cluster_status_not_green_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
# ok = 2
|
# ok = 2
|
||||||
@ -94,11 +94,11 @@ resource "datadog_monitor" "cluster_initializing_shards" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cluster_initializing_shards_time_aggregator}(${var.cluster_initializing_shards_timeframe}):
|
${var.cluster_initializing_shards_time_aggregator}(${var.cluster_initializing_shards_timeframe}):
|
||||||
avg:elasticsearch.initializing_shards${module.filter-tags.query_alert} by {cluster_name}
|
avg:elasticsearch.initializing_shards${module.filter-tags.query_alert} by {cluster_name}
|
||||||
> ${var.cluster_initializing_shards_threshold_critical}
|
> ${var.cluster_initializing_shards_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.cluster_initializing_shards_threshold_warning}"
|
warning = "${var.cluster_initializing_shards_threshold_warning}"
|
||||||
@ -136,11 +136,11 @@ resource "datadog_monitor" "cluster_relocating_shards" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cluster_relocating_shards_time_aggregator}(${var.cluster_relocating_shards_timeframe}):
|
${var.cluster_relocating_shards_time_aggregator}(${var.cluster_relocating_shards_timeframe}):
|
||||||
avg:elasticsearch.relocating_shards${module.filter-tags.query_alert} by {cluster_name}
|
avg:elasticsearch.relocating_shards${module.filter-tags.query_alert} by {cluster_name}
|
||||||
> ${var.cluster_relocating_shards_threshold_critical}
|
> ${var.cluster_relocating_shards_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.cluster_relocating_shards_threshold_warning}"
|
warning = "${var.cluster_relocating_shards_threshold_warning}"
|
||||||
@ -178,11 +178,11 @@ resource "datadog_monitor" "cluster_unassigned_shards" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cluster_unassigned_shards_time_aggregator}(${var.cluster_unassigned_shards_timeframe}):
|
${var.cluster_unassigned_shards_time_aggregator}(${var.cluster_unassigned_shards_timeframe}):
|
||||||
avg:elasticsearch.unassigned_shards${module.filter-tags.query_alert} by {cluster_name}
|
avg:elasticsearch.unassigned_shards${module.filter-tags.query_alert} by {cluster_name}
|
||||||
> ${var.cluster_unassigned_shards_threshold_critical}
|
> ${var.cluster_unassigned_shards_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.cluster_unassigned_shards_threshold_warning}"
|
warning = "${var.cluster_unassigned_shards_threshold_warning}"
|
||||||
@ -220,14 +220,14 @@ resource "datadog_monitor" "node_free_space" {
|
|||||||
|
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.node_free_space_time_aggregator}(${var.node_free_space_timeframe}):
|
${var.node_free_space_time_aggregator}(${var.node_free_space_timeframe}):
|
||||||
(min:elasticsearch.fs.total.available_in_bytes${module.filter-tags.query_alert} by {node_name}
|
(min:elasticsearch.fs.total.available_in_bytes${module.filter-tags.query_alert} by {node_name}
|
||||||
/
|
/
|
||||||
min:elasticsearch.fs.total.total_in_bytes${module.filter-tags.query_alert} by {node_name}
|
min:elasticsearch.fs.total.total_in_bytes${module.filter-tags.query_alert} by {node_name}
|
||||||
) * 100
|
) * 100
|
||||||
< ${var.node_free_space_threshold_critical}
|
< ${var.node_free_space_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.node_free_space_threshold_warning}"
|
warning = "${var.node_free_space_threshold_warning}"
|
||||||
@ -265,11 +265,11 @@ resource "datadog_monitor" "jvm_heap_memory_usage" {
|
|||||||
|
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.jvm_heap_memory_usage_time_aggregator}(${var.jvm_heap_memory_usage_timeframe}):
|
${var.jvm_heap_memory_usage_time_aggregator}(${var.jvm_heap_memory_usage_timeframe}):
|
||||||
avg:jvm.mem.heap_in_use${module.filter-tags.query_alert} by {node_name}
|
avg:jvm.mem.heap_in_use${module.filter-tags.query_alert} by {node_name}
|
||||||
> ${var.jvm_heap_memory_usage_threshold_critical}
|
> ${var.jvm_heap_memory_usage_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.jvm_heap_memory_usage_threshold_warning}"
|
warning = "${var.jvm_heap_memory_usage_threshold_warning}"
|
||||||
@ -307,11 +307,11 @@ resource "datadog_monitor" "jvm_memory_young_usage" {
|
|||||||
|
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.jvm_memory_young_usage_time_aggregator}(${var.jvm_memory_young_usage_timeframe}):
|
${var.jvm_memory_young_usage_time_aggregator}(${var.jvm_memory_young_usage_timeframe}):
|
||||||
avg:jvm.mem.pools.young.used${module.filter-tags.query_alert} by {node_name} / avg:jvm.mem.pools.young.max${module.filter-tags.query_alert} by {node_name} * 100
|
avg:jvm.mem.pools.young.used${module.filter-tags.query_alert} by {node_name} / avg:jvm.mem.pools.young.max${module.filter-tags.query_alert} by {node_name} * 100
|
||||||
> ${var.jvm_memory_young_usage_threshold_critical}
|
> ${var.jvm_memory_young_usage_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.jvm_memory_young_usage_threshold_warning}"
|
warning = "${var.jvm_memory_young_usage_threshold_warning}"
|
||||||
@ -349,11 +349,11 @@ resource "datadog_monitor" "jvm_memory_old_usage" {
|
|||||||
|
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.jvm_memory_old_usage_time_aggregator}(${var.jvm_memory_old_usage_timeframe}):
|
${var.jvm_memory_old_usage_time_aggregator}(${var.jvm_memory_old_usage_timeframe}):
|
||||||
avg:jvm.mem.pools.old.used${module.filter-tags.query_alert} by {node_name} / avg:jvm.mem.pools.old.max${module.filter-tags.query_alert} by {node_name} * 100
|
avg:jvm.mem.pools.old.used${module.filter-tags.query_alert} by {node_name} / avg:jvm.mem.pools.old.max${module.filter-tags.query_alert} by {node_name} * 100
|
||||||
> ${var.jvm_memory_old_usage_threshold_critical}
|
> ${var.jvm_memory_old_usage_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.jvm_memory_old_usage_threshold_warning}"
|
warning = "${var.jvm_memory_old_usage_threshold_warning}"
|
||||||
@ -391,11 +391,11 @@ resource "datadog_monitor" "jvm_gc_old_collection_latency" {
|
|||||||
|
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.jvm_gc_old_collection_latency_time_aggregator}(${var.jvm_gc_old_collection_latency_timeframe}):
|
${var.jvm_gc_old_collection_latency_time_aggregator}(${var.jvm_gc_old_collection_latency_timeframe}):
|
||||||
avg:jvm.gc.collectors.old.collection_time${module.filter-tags.query_alert} by {node_name} / avg:jvm.gc.collectors.old.count${module.filter-tags.query_alert} by {node_name} * 1000
|
avg:jvm.gc.collectors.old.collection_time${module.filter-tags.query_alert} by {node_name} / avg:jvm.gc.collectors.old.count${module.filter-tags.query_alert} by {node_name} * 1000
|
||||||
> ${var.jvm_gc_old_collection_latency_threshold_critical}
|
> ${var.jvm_gc_old_collection_latency_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.jvm_gc_old_collection_latency_threshold_warning}"
|
warning = "${var.jvm_gc_old_collection_latency_threshold_warning}"
|
||||||
@ -433,11 +433,11 @@ resource "datadog_monitor" "jvm_gc_young_collection_latency" {
|
|||||||
|
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.jvm_gc_young_collection_latency_time_aggregator}(${var.jvm_gc_young_collection_latency_timeframe}):
|
${var.jvm_gc_young_collection_latency_time_aggregator}(${var.jvm_gc_young_collection_latency_timeframe}):
|
||||||
avg:jvm.gc.collectors.young.collection_time${module.filter-tags.query_alert} by {node_name} / avg:jvm.gc.collectors.young.count${module.filter-tags.query_alert} by {node_name} * 1000
|
avg:jvm.gc.collectors.young.collection_time${module.filter-tags.query_alert} by {node_name} / avg:jvm.gc.collectors.young.count${module.filter-tags.query_alert} by {node_name} * 1000
|
||||||
> ${var.jvm_gc_young_collection_latency_threshold_critical}
|
> ${var.jvm_gc_young_collection_latency_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.jvm_gc_young_collection_latency_threshold_warning}"
|
warning = "${var.jvm_gc_young_collection_latency_threshold_warning}"
|
||||||
@ -476,11 +476,11 @@ resource "datadog_monitor" "indexing_latency" {
|
|||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.indexing_latency_time_aggregator}(${var.indexing_latency_timeframe}):
|
${var.indexing_latency_time_aggregator}(${var.indexing_latency_timeframe}):
|
||||||
avg:elasticsearch.indexing.index.time${module.filter-tags.query_alert} by {node_name}/ avg:elasticsearch.indexing.index.total${module.filter-tags.query_alert} by {node_name} * 1000
|
avg:elasticsearch.indexing.index.time${module.filter-tags.query_alert} by {node_name}/ avg:elasticsearch.indexing.index.total${module.filter-tags.query_alert} by {node_name} * 1000
|
||||||
> ${var.indexing_latency_threshold_critical}
|
> ${var.indexing_latency_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.indexing_latency_threshold_warning}"
|
warning = "${var.indexing_latency_threshold_warning}"
|
||||||
@ -519,11 +519,11 @@ resource "datadog_monitor" "flush_latency" {
|
|||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.flush_latency_time_aggregator}(${var.flush_latency_timeframe}):
|
${var.flush_latency_time_aggregator}(${var.flush_latency_timeframe}):
|
||||||
avg:elasticsearch.flush.total.time${module.filter-tags.query_alert} by {node_name} / avg:elasticsearch.flush.total${module.filter-tags.query_alert} by {node_name} * 1000
|
avg:elasticsearch.flush.total.time${module.filter-tags.query_alert} by {node_name} / avg:elasticsearch.flush.total${module.filter-tags.query_alert} by {node_name} * 1000
|
||||||
> ${var.flush_latency_threshold_critical}
|
> ${var.flush_latency_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.flush_latency_threshold_warning}"
|
warning = "${var.flush_latency_threshold_warning}"
|
||||||
@ -561,7 +561,7 @@ resource "datadog_monitor" "http_connections_anomaly" {
|
|||||||
|
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.http_connections_anomaly_time_aggregator}(${var.http_connections_anomaly_timeframe}):
|
${var.http_connections_anomaly_time_aggregator}(${var.http_connections_anomaly_timeframe}):
|
||||||
anomalies(avg:elasticsearch.http.current_open${module.filter-tags.query_alert} by {node_name},
|
anomalies(avg:elasticsearch.http.current_open${module.filter-tags.query_alert} by {node_name},
|
||||||
'${var.http_connections_anomaly_detection_algorithm}',
|
'${var.http_connections_anomaly_detection_algorithm}',
|
||||||
@ -573,7 +573,7 @@ resource "datadog_monitor" "http_connections_anomaly" {
|
|||||||
seasonality='${var.http_connections_anomaly_seasonality}'
|
seasonality='${var.http_connections_anomaly_seasonality}'
|
||||||
)
|
)
|
||||||
>= ${var.http_connections_anomaly_threshold_critical}
|
>= ${var.http_connections_anomaly_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.http_connections_anomaly_threshold_warning}"
|
warning = "${var.http_connections_anomaly_threshold_warning}"
|
||||||
@ -612,11 +612,11 @@ resource "datadog_monitor" "search_query_latency" {
|
|||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.search_query_latency_time_aggregator}(${var.search_query_latency_timeframe}):
|
${var.search_query_latency_time_aggregator}(${var.search_query_latency_timeframe}):
|
||||||
avg:elasticsearch.search.query.time${module.filter-tags.query_alert} by {node_name} / avg:elasticsearch.search.query.total${module.filter-tags.query_alert} by {node_name} * 1000
|
avg:elasticsearch.search.query.time${module.filter-tags.query_alert} by {node_name} / avg:elasticsearch.search.query.total${module.filter-tags.query_alert} by {node_name} * 1000
|
||||||
> ${var.search_query_latency_threshold_critical}
|
> ${var.search_query_latency_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.search_query_latency_threshold_warning}"
|
warning = "${var.search_query_latency_threshold_warning}"
|
||||||
@ -655,11 +655,11 @@ resource "datadog_monitor" "fetch_latency" {
|
|||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.fetch_latency_time_aggregator}(${var.fetch_latency_timeframe}):
|
${var.fetch_latency_time_aggregator}(${var.fetch_latency_timeframe}):
|
||||||
avg:elasticsearch.search.fetch.time${module.filter-tags.query_alert} by {node_name} / avg:elasticsearch.search.fetch.total${module.filter-tags.query_alert} by {node_name} * 1000
|
avg:elasticsearch.search.fetch.time${module.filter-tags.query_alert} by {node_name} / avg:elasticsearch.search.fetch.total${module.filter-tags.query_alert} by {node_name} * 1000
|
||||||
> ${var.fetch_latency_threshold_critical}
|
> ${var.fetch_latency_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.fetch_latency_threshold_warning}"
|
warning = "${var.fetch_latency_threshold_warning}"
|
||||||
@ -697,11 +697,11 @@ resource "datadog_monitor" "search_query_change" {
|
|||||||
|
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
pct_change(${var.search_query_change_time_aggregator}(${var.search_query_change_timeframe}),${var.search_query_change_timeshift}):
|
pct_change(${var.search_query_change_time_aggregator}(${var.search_query_change_timeframe}),${var.search_query_change_timeshift}):
|
||||||
avg:elasticsearch.search.query.current${module.filter-tags.query_alert} by {cluster_name}
|
avg:elasticsearch.search.query.current${module.filter-tags.query_alert} by {cluster_name}
|
||||||
>= ${var.search_query_change_threshold_critical}
|
>= ${var.search_query_change_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.search_query_change_threshold_warning}"
|
warning = "${var.search_query_change_threshold_warning}"
|
||||||
@ -739,11 +739,11 @@ resource "datadog_monitor" "fetch_change" {
|
|||||||
|
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
pct_change(${var.fetch_change_time_aggregator}(${var.fetch_change_timeframe}),${var.fetch_change_timeshift}):
|
pct_change(${var.fetch_change_time_aggregator}(${var.fetch_change_timeframe}),${var.fetch_change_timeshift}):
|
||||||
avg:elasticsearch.search.fetch.current${module.filter-tags.query_alert} by {cluster_name}
|
avg:elasticsearch.search.fetch.current${module.filter-tags.query_alert} by {cluster_name}
|
||||||
>= ${var.fetch_change_threshold_critical}
|
>= ${var.fetch_change_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.fetch_change_threshold_warning}"
|
warning = "${var.fetch_change_threshold_warning}"
|
||||||
@ -782,11 +782,11 @@ resource "datadog_monitor" "field_data_evictions_change" {
|
|||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
change(${var.field_data_evictions_change_time_aggregator}(${var.field_data_evictions_change_timeframe}),${var.field_data_evictions_change_timeshift}):
|
change(${var.field_data_evictions_change_time_aggregator}(${var.field_data_evictions_change_timeframe}),${var.field_data_evictions_change_timeshift}):
|
||||||
avg:elasticsearch.fielddata.evictions${module.filter-tags.query_alert} by {node_name}
|
avg:elasticsearch.fielddata.evictions${module.filter-tags.query_alert} by {node_name}
|
||||||
> ${var.field_data_evictions_change_threshold_critical}
|
> ${var.field_data_evictions_change_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.field_data_evictions_change_threshold_warning}"
|
warning = "${var.field_data_evictions_change_threshold_warning}"
|
||||||
@ -825,11 +825,11 @@ resource "datadog_monitor" "query_cache_evictions_change" {
|
|||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
change(${var.query_cache_evictions_change_time_aggregator}(${var.query_cache_evictions_change_timeframe}),${var.query_cache_evictions_change_timeshift}):
|
change(${var.query_cache_evictions_change_time_aggregator}(${var.query_cache_evictions_change_timeframe}),${var.query_cache_evictions_change_timeshift}):
|
||||||
avg:elasticsearch.indices.query_cache.evictions${module.filter-tags.query_alert} by {node_name}
|
avg:elasticsearch.indices.query_cache.evictions${module.filter-tags.query_alert} by {node_name}
|
||||||
> ${var.query_cache_evictions_change_threshold_critical}
|
> ${var.query_cache_evictions_change_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.query_cache_evictions_change_threshold_warning}"
|
warning = "${var.query_cache_evictions_change_threshold_warning}"
|
||||||
@ -868,11 +868,11 @@ resource "datadog_monitor" "request_cache_evictions_change" {
|
|||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
change(${var.request_cache_evictions_change_time_aggregator}(${var.request_cache_evictions_change_timeframe}),${var.request_cache_evictions_change_timeshift}):
|
change(${var.request_cache_evictions_change_time_aggregator}(${var.request_cache_evictions_change_timeframe}),${var.request_cache_evictions_change_timeshift}):
|
||||||
avg:elasticsearch.indices.request_cache.evictions${module.filter-tags.query_alert} by {node_name}
|
avg:elasticsearch.indices.request_cache.evictions${module.filter-tags.query_alert} by {node_name}
|
||||||
> ${var.request_cache_evictions_change_threshold_critical}
|
> ${var.request_cache_evictions_change_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.request_cache_evictions_change_threshold_warning}"
|
warning = "${var.request_cache_evictions_change_threshold_warning}"
|
||||||
@ -910,11 +910,11 @@ resource "datadog_monitor" "task_time_in_queue_change" {
|
|||||||
|
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
change(${var.task_time_in_queue_change_time_aggregator}(${var.task_time_in_queue_change_timeframe}),${var.task_time_in_queue_change_timeshift}):
|
change(${var.task_time_in_queue_change_time_aggregator}(${var.task_time_in_queue_change_timeframe}),${var.task_time_in_queue_change_timeshift}):
|
||||||
avg:elasticsearch.pending_tasks_time_in_queue${module.filter-tags.query_alert} by {cluster_name}
|
avg:elasticsearch.pending_tasks_time_in_queue${module.filter-tags.query_alert} by {cluster_name}
|
||||||
> ${var.task_time_in_queue_change_threshold_critical}
|
> ${var.task_time_in_queue_change_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.task_time_in_queue_change_threshold_warning}"
|
warning = "${var.task_time_in_queue_change_threshold_warning}"
|
||||||
|
|||||||
@ -3,10 +3,10 @@ resource "datadog_monitor" "mongodb_primary" {
|
|||||||
name = "[${var.environment}] MongoDB primary state"
|
name = "[${var.environment}] MongoDB primary state"
|
||||||
message = "${coalesce(var.mongodb_primary_message, var.message)}"
|
message = "${coalesce(var.mongodb_primary_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.mongodb_primary_aggregator}(${var.mongodb_primary_timeframe}):
|
${var.mongodb_primary_aggregator}(${var.mongodb_primary_timeframe}):
|
||||||
min:mongodb.replset.state${module.filter-tags.query_alert} by {replset_name} >= 2
|
min:mongodb.replset.state${module.filter-tags.query_alert} by {replset_name} >= 2
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -29,12 +29,12 @@ resource "datadog_monitor" "mongodb_secondary" {
|
|||||||
name = "[${var.environment}] MongoDB secondary missing"
|
name = "[${var.environment}] MongoDB secondary missing"
|
||||||
message = "${coalesce(var.mongodb_secondary_message, var.message)}"
|
message = "${coalesce(var.mongodb_secondary_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.mongodb_secondary_aggregator}(${var.mongodb_secondary_timeframe}):
|
${var.mongodb_secondary_aggregator}(${var.mongodb_secondary_timeframe}):
|
||||||
${var.mongodb_desired_servers_count} -
|
${var.mongodb_desired_servers_count} -
|
||||||
sum:mongodb.replset.health${module.filter-tags.query_alert} by {replset_name}
|
sum:mongodb.replset.health${module.filter-tags.query_alert} by {replset_name}
|
||||||
> 1
|
> 1
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = 1
|
critical = 1
|
||||||
@ -62,11 +62,11 @@ resource "datadog_monitor" "mongodb_server_count" {
|
|||||||
name = "[${var.environment}] MongoDB too much servers or wrong monitoring config"
|
name = "[${var.environment}] MongoDB too much servers or wrong monitoring config"
|
||||||
message = "${coalesce(var.mongodb_server_count_message, var.message)}"
|
message = "${coalesce(var.mongodb_server_count_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.mongodb_server_count_aggregator}(${var.mongodb_server_count_timeframe}):
|
${var.mongodb_server_count_aggregator}(${var.mongodb_server_count_timeframe}):
|
||||||
sum:mongodb.replset.health${module.filter-tags.query_alert} by {replset_name}
|
sum:mongodb.replset.health${module.filter-tags.query_alert} by {replset_name}
|
||||||
> 99
|
> 99
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = 99
|
critical = 99
|
||||||
@ -94,10 +94,10 @@ resource "datadog_monitor" "mongodb_replication" {
|
|||||||
name = "[${var.environment}] MongoDB replication lag"
|
name = "[${var.environment}] MongoDB replication lag"
|
||||||
message = "${coalesce(var.mongodb_replication_message, var.message)}"
|
message = "${coalesce(var.mongodb_replication_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.mongodb_replication_aggregator}(${var.mongodb_replication_timeframe}):
|
${var.mongodb_replication_aggregator}(${var.mongodb_replication_timeframe}):
|
||||||
avg:mongodb.replset.replicationlag${module.filter-tags-secondary.query_alert} by {server} > ${var.mongodb_lag_critical}
|
avg:mongodb.replset.replicationlag${module.filter-tags-secondary.query_alert} by {server} > ${var.mongodb_lag_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.mongodb_lag_critical}"
|
critical = "${var.mongodb_lag_critical}"
|
||||||
|
|||||||
@ -5,9 +5,9 @@ resource "datadog_monitor" "mysql_availability" {
|
|||||||
|
|
||||||
type = "service check"
|
type = "service check"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
"mysql.can_connect"${module.filter-tags.service_check}.by("port","server").last(6).count_by_status()
|
"mysql.can_connect"${module.filter-tags.service_check}.by("port","server").last(6).count_by_status()
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds = {
|
thresholds = {
|
||||||
warning = "${var.mysql_availability_threshold_warning}"
|
warning = "${var.mysql_availability_threshold_warning}"
|
||||||
@ -35,12 +35,12 @@ resource "datadog_monitor" "mysql_connection" {
|
|||||||
message = "${coalesce(var.mysql_connection_message, var.message)}"
|
message = "${coalesce(var.mysql_connection_message, var.message)}"
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.mysql_connection_time_aggregator}(${var.mysql_connection_timeframe}): (
|
${var.mysql_connection_time_aggregator}(${var.mysql_connection_timeframe}): (
|
||||||
avg:mysql.net.connections${module.filter-tags.query_alert} by {server} /
|
avg:mysql.net.connections${module.filter-tags.query_alert} by {server} /
|
||||||
avg:mysql.net.max_connections_available${module.filter-tags.query_alert} by {server}
|
avg:mysql.net.max_connections_available${module.filter-tags.query_alert} by {server}
|
||||||
) * 100 > ${var.mysql_connection_threshold_critical}
|
) * 100 > ${var.mysql_connection_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -67,12 +67,12 @@ resource "datadog_monitor" "mysql_aborted" {
|
|||||||
message = "${coalesce(var.mysql_aborted_message, var.message)}"
|
message = "${coalesce(var.mysql_aborted_message, var.message)}"
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.mysql_aborted_time_aggregator}(${var.mysql_aborted_timeframe}): (
|
${var.mysql_aborted_time_aggregator}(${var.mysql_aborted_timeframe}): (
|
||||||
avg:mysql.net.aborted_connects${module.filter-tags.query_alert} by {server} /
|
avg:mysql.net.aborted_connects${module.filter-tags.query_alert} by {server} /
|
||||||
avg:mysql.performance.threads_connected${module.filter-tags.query_alert} by {server}
|
avg:mysql.performance.threads_connected${module.filter-tags.query_alert} by {server}
|
||||||
) * 100 > ${var.mysql_aborted_threshold_critical}
|
) * 100 > ${var.mysql_aborted_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -99,12 +99,12 @@ resource "datadog_monitor" "mysql_slow" {
|
|||||||
message = "${coalesce(var.mysql_slow_message, var.message)}"
|
message = "${coalesce(var.mysql_slow_message, var.message)}"
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.mysql_slow_time_aggregator}(${var.mysql_slow_timeframe}): (
|
${var.mysql_slow_time_aggregator}(${var.mysql_slow_timeframe}): (
|
||||||
avg:mysql.performance.slow_queries${module.filter-tags.query_alert} by {server} /
|
avg:mysql.performance.slow_queries${module.filter-tags.query_alert} by {server} /
|
||||||
avg:mysql.performance.queries${module.filter-tags.query_alert} by {server}
|
avg:mysql.performance.queries${module.filter-tags.query_alert} by {server}
|
||||||
) * 100 > ${var.mysql_slow_threshold_critical}
|
) * 100 > ${var.mysql_slow_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -131,12 +131,12 @@ resource "datadog_monitor" "mysql_pool_efficiency" {
|
|||||||
message = "${coalesce(var.mysql_pool_efficiency_message, var.message)}"
|
message = "${coalesce(var.mysql_pool_efficiency_message, var.message)}"
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.mysql_pool_efficiency_time_aggregator}(${var.mysql_pool_efficiency_timeframe}): (
|
${var.mysql_pool_efficiency_time_aggregator}(${var.mysql_pool_efficiency_timeframe}): (
|
||||||
avg:mysql.innodb.buffer_pool_reads${module.filter-tags.query_alert} by {server} /
|
avg:mysql.innodb.buffer_pool_reads${module.filter-tags.query_alert} by {server} /
|
||||||
avg:mysql.innodb.buffer_pool_read_requests${module.filter-tags.query_alert} by {server}
|
avg:mysql.innodb.buffer_pool_read_requests${module.filter-tags.query_alert} by {server}
|
||||||
) * 100 > ${var.mysql_pool_efficiency_threshold_critical}
|
) * 100 > ${var.mysql_pool_efficiency_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -163,13 +163,13 @@ resource "datadog_monitor" "mysql_pool_utilization" {
|
|||||||
message = "${coalesce(var.mysql_pool_utilization_message, var.message)}"
|
message = "${coalesce(var.mysql_pool_utilization_message, var.message)}"
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.mysql_pool_utilization_time_aggregator}(${var.mysql_pool_utilization_timeframe}):
|
${var.mysql_pool_utilization_time_aggregator}(${var.mysql_pool_utilization_timeframe}):
|
||||||
( avg:mysql.innodb.buffer_pool_total${module.filter-tags.query_alert} by {server} -
|
( avg:mysql.innodb.buffer_pool_total${module.filter-tags.query_alert} by {server} -
|
||||||
avg:mysql.innodb.buffer_pool_free${module.filter-tags.query_alert} by {server} ) /
|
avg:mysql.innodb.buffer_pool_free${module.filter-tags.query_alert} by {server} ) /
|
||||||
avg:mysql.innodb.buffer_pool_total${module.filter-tags.query_alert} by {server}
|
avg:mysql.innodb.buffer_pool_total${module.filter-tags.query_alert} by {server}
|
||||||
* 100 > ${var.mysql_pool_utilization_threshold_critical}
|
* 100 > ${var.mysql_pool_utilization_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -196,7 +196,7 @@ resource "datadog_monitor" "mysql_threads_anomaly" {
|
|||||||
message = "${coalesce(var.mysql_threads_message, var.message)}"
|
message = "${coalesce(var.mysql_threads_message, var.message)}"
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.mysql_threads_time_aggregator}(${var.mysql_threads_timeframe}):
|
${var.mysql_threads_time_aggregator}(${var.mysql_threads_timeframe}):
|
||||||
anomalies(
|
anomalies(
|
||||||
avg:mysql.performance.threads_running${module.filter-tags.query_alert} by {server},
|
avg:mysql.performance.threads_running${module.filter-tags.query_alert} by {server},
|
||||||
@ -209,7 +209,7 @@ resource "datadog_monitor" "mysql_threads_anomaly" {
|
|||||||
${var.mysql_threads_seasonality == "agile" ? format(",seasonality='%s'", var.mysql_threads_seasonality): ""}
|
${var.mysql_threads_seasonality == "agile" ? format(",seasonality='%s'", var.mysql_threads_seasonality): ""}
|
||||||
)
|
)
|
||||||
>= ${var.mysql_threads_threshold_critical}
|
>= ${var.mysql_threads_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -236,7 +236,7 @@ resource "datadog_monitor" "mysql_questions_anomaly" {
|
|||||||
message = "${coalesce(var.mysql_questions_message, var.message)}"
|
message = "${coalesce(var.mysql_questions_message, var.message)}"
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.mysql_questions_time_aggregator}(${var.mysql_questions_timeframe}):
|
${var.mysql_questions_time_aggregator}(${var.mysql_questions_timeframe}):
|
||||||
anomalies(
|
anomalies(
|
||||||
avg:mysql.performance.questions${module.filter-tags.query_alert} by {server},
|
avg:mysql.performance.questions${module.filter-tags.query_alert} by {server},
|
||||||
@ -249,7 +249,7 @@ resource "datadog_monitor" "mysql_questions_anomaly" {
|
|||||||
${var.mysql_questions_detection_algorithm == "agile" ? format(",seasonality='%s'", var.mysql_questions_seasonality): ""}
|
${var.mysql_questions_detection_algorithm == "agile" ? format(",seasonality='%s'", var.mysql_questions_seasonality): ""}
|
||||||
)
|
)
|
||||||
>= ${var.mysql_questions_threshold_critical}
|
>= ${var.mysql_questions_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
|
|||||||
@ -5,9 +5,9 @@ resource "datadog_monitor" "postgresql_availability" {
|
|||||||
|
|
||||||
type = "service check"
|
type = "service check"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
"postgres.can_connect"${module.filter-tags.service_check}.by("port","server").last(6).count_by_status()
|
"postgres.can_connect"${module.filter-tags.service_check}.by("port","server").last(6).count_by_status()
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds = {
|
thresholds = {
|
||||||
warning = "${var.postgresql_availability_threshold_warning}"
|
warning = "${var.postgresql_availability_threshold_warning}"
|
||||||
@ -35,11 +35,11 @@ resource "datadog_monitor" "postgresql_connection_too_high" {
|
|||||||
message = "${coalesce(var.postgresql_connection_message, var.message)}"
|
message = "${coalesce(var.postgresql_connection_message, var.message)}"
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.postgresql_connection_time_aggregator}(${var.postgresql_connection_timeframe}):
|
${var.postgresql_connection_time_aggregator}(${var.postgresql_connection_timeframe}):
|
||||||
avg:postgresql.percent_usage_connections${module.filter-tags.query_alert} by {server}
|
avg:postgresql.percent_usage_connections${module.filter-tags.query_alert} by {server}
|
||||||
* 100 > ${var.postgresql_connection_threshold_critical}
|
* 100 > ${var.postgresql_connection_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
@ -66,11 +66,11 @@ resource "datadog_monitor" "postgresql_too_many_locks" {
|
|||||||
message = "${coalesce(var.postgresql_lock_message, var.message)}"
|
message = "${coalesce(var.postgresql_lock_message, var.message)}"
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.postgresql_lock_time_aggregator}(${var.postgresql_lock_timeframe}):
|
${var.postgresql_lock_time_aggregator}(${var.postgresql_lock_timeframe}):
|
||||||
default(avg:postgresql.locks${module.filter-tags.query_alert} by {server}, 0)
|
default(avg:postgresql.locks${module.filter-tags.query_alert} by {server}, 0)
|
||||||
> ${var.postgresql_lock_threshold_critical}
|
> ${var.postgresql_lock_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
evaluation_delay = "${var.evaluation_delay}"
|
evaluation_delay = "${var.evaluation_delay}"
|
||||||
new_host_delay = "${var.new_host_delay}"
|
new_host_delay = "${var.new_host_delay}"
|
||||||
|
|||||||
@ -8,9 +8,9 @@ resource "datadog_monitor" "not_responding" {
|
|||||||
|
|
||||||
type = "service check"
|
type = "service check"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
"redis.can_connect"${module.filter-tags.service_check}.by("redis_host","redis_port").last(6).count_by_status()
|
"redis.can_connect"${module.filter-tags.service_check}.by("redis_host","redis_port").last(6).count_by_status()
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.not_responding_threshold_warning}"
|
warning = "${var.not_responding_threshold_warning}"
|
||||||
@ -38,11 +38,11 @@ resource "datadog_monitor" "evicted_keys" {
|
|||||||
name = "[${var.environment}] Redis evicted keys {{#is_alert}}{{{comparator}}} {{threshold}}% (+{{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% (+{{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Redis evicted keys {{#is_alert}}{{{comparator}}} {{threshold}}% (+{{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% (+{{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.evictedkeys_change_message, var.message)}"
|
message = "${coalesce(var.evictedkeys_change_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOL
|
query = <<EOQ
|
||||||
change(${var.evictedkeys_change_time_aggregator}(${var.evictedkeys_change_timeframe}),${var.evictedkeys_change_timeframe}): (
|
change(${var.evictedkeys_change_time_aggregator}(${var.evictedkeys_change_timeframe}),${var.evictedkeys_change_timeframe}): (
|
||||||
avg:redis.keys.evicted${module.filter-tags.query_alert} by {redis_host,redis_port}
|
avg:redis.keys.evicted${module.filter-tags.query_alert} by {redis_host,redis_port}
|
||||||
) > ${var.evictedkeys_change_threshold_critical}
|
) > ${var.evictedkeys_change_threshold_critical}
|
||||||
EOL
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -71,11 +71,11 @@ resource "datadog_monitor" "expirations" {
|
|||||||
name = "[${var.environment}] Redis expired keys {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Redis expired keys {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.expirations_rate_message, var.message)}"
|
message = "${coalesce(var.expirations_rate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOL
|
query = <<EOQ
|
||||||
${var.expirations_rate_time_aggregator}(${var.expirations_rate_timeframe}): (
|
${var.expirations_rate_time_aggregator}(${var.expirations_rate_timeframe}): (
|
||||||
avg:redis.expires.percent${module.filter-tags.query_alert} by {redis_host,redis_port}
|
avg:redis.expires.percent${module.filter-tags.query_alert} by {redis_host,redis_port}
|
||||||
) > ${var.expirations_rate_threshold_critical}
|
) > ${var.expirations_rate_threshold_critical}
|
||||||
EOL
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -104,12 +104,12 @@ resource "datadog_monitor" "blocked_clients" {
|
|||||||
name = "[${var.environment}] Redis blocked clients {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Redis blocked clients {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.blocked_clients_message, var.message)}"
|
message = "${coalesce(var.blocked_clients_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOL
|
query = <<EOQ
|
||||||
${var.blocked_clients_time_aggregator}(${var.blocked_clients_timeframe}): (
|
${var.blocked_clients_time_aggregator}(${var.blocked_clients_timeframe}): (
|
||||||
sum:redis.clients.blocked${module.filter-tags.query_alert} by {redis_host,redis_port}
|
sum:redis.clients.blocked${module.filter-tags.query_alert} by {redis_host,redis_port}
|
||||||
/ sum:redis.net.clients${module.filter-tags.query_alert} by {redis_host,redis_port}
|
/ sum:redis.net.clients${module.filter-tags.query_alert} by {redis_host,redis_port}
|
||||||
) * 100 > ${var.blocked_clients_threshold_critical}
|
) * 100 > ${var.blocked_clients_threshold_critical}
|
||||||
EOL
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -138,11 +138,11 @@ resource "datadog_monitor" "keyspace_full" {
|
|||||||
name = "[${var.environment}] Redis keyspace seems full (no changes since ${var.keyspace_timeframe})"
|
name = "[${var.environment}] Redis keyspace seems full (no changes since ${var.keyspace_timeframe})"
|
||||||
message = "${coalesce(var.keyspace_message, var.message)}"
|
message = "${coalesce(var.keyspace_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOL
|
query = <<EOQ
|
||||||
${var.keyspace_time_aggregator}(${var.keyspace_timeframe}): (
|
${var.keyspace_time_aggregator}(${var.keyspace_timeframe}): (
|
||||||
abs(diff(avg:redis.keys${module.filter-tags.query_alert} by {redis_host,redis_port}))
|
abs(diff(avg:redis.keys${module.filter-tags.query_alert} by {redis_host,redis_port}))
|
||||||
) == ${var.keyspace_threshold_critical}
|
) == ${var.keyspace_threshold_critical}
|
||||||
EOL
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -171,12 +171,12 @@ resource "datadog_monitor" "memory_used" {
|
|||||||
name = "[${var.environment}] Redis memory used {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Redis memory used {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.mem_used_message, var.message)}"
|
message = "${coalesce(var.mem_used_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOL
|
query = <<EOQ
|
||||||
${var.mem_used_time_aggregator}(${var.mem_used_timeframe}): (
|
${var.mem_used_time_aggregator}(${var.mem_used_timeframe}): (
|
||||||
avg:redis.mem.used${module.filter-tags.query_alert} by {redis_host,redis_port}
|
avg:redis.mem.used${module.filter-tags.query_alert} by {redis_host,redis_port}
|
||||||
/ max:redis.mem.maxmemory${module.filter-tags.query_alert} by {redis_host,redis_port}
|
/ max:redis.mem.maxmemory${module.filter-tags.query_alert} by {redis_host,redis_port}
|
||||||
) * 100 > ${var.mem_used_threshold_critical}
|
) * 100 > ${var.mem_used_threshold_critical}
|
||||||
EOL
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -205,11 +205,11 @@ resource "datadog_monitor" "memory_frag" {
|
|||||||
name = "[${var.environment}] Redis memory fragmented {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Redis memory fragmented {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.mem_frag_message, var.message)}"
|
message = "${coalesce(var.mem_frag_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOL
|
query = <<EOQ
|
||||||
${var.mem_frag_time_aggregator}(${var.mem_frag_timeframe}):
|
${var.mem_frag_time_aggregator}(${var.mem_frag_timeframe}):
|
||||||
avg:redis.mem.fragmentation_ratio${module.filter-tags.query_alert} by {redis_host,redis_port}
|
avg:redis.mem.fragmentation_ratio${module.filter-tags.query_alert} by {redis_host,redis_port}
|
||||||
* 100 > ${var.mem_frag_threshold_critical}
|
* 100 > ${var.mem_frag_threshold_critical}
|
||||||
EOL
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -238,11 +238,11 @@ resource "datadog_monitor" "rejected_connections" {
|
|||||||
name = "[${var.environment}] Redis rejected connections {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
name = "[${var.environment}] Redis rejected connections {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||||
message = "${coalesce(var.rejected_con_message, var.message)}"
|
message = "${coalesce(var.rejected_con_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOL
|
query = <<EOQ
|
||||||
change(${var.rejected_con_time_aggregator}(${var.rejected_con_timeframe}),${var.rejected_con_timeframe}): (
|
change(${var.rejected_con_time_aggregator}(${var.rejected_con_timeframe}),${var.rejected_con_timeframe}): (
|
||||||
avg:redis.net.rejected${module.filter-tags.query_alert} by {redis_host,redis_port}
|
avg:redis.net.rejected${module.filter-tags.query_alert} by {redis_host,redis_port}
|
||||||
) > ${var.rejected_con_threshold_critical}
|
) > ${var.rejected_con_threshold_critical}
|
||||||
EOL
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -271,11 +271,11 @@ resource "datadog_monitor" "latency" {
|
|||||||
name = "[${var.environment}] Redis latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}){{/is_warning}}"
|
name = "[${var.environment}] Redis latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}){{/is_warning}}"
|
||||||
message = "${coalesce(var.latency_message, var.message)}"
|
message = "${coalesce(var.latency_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOL
|
query = <<EOQ
|
||||||
change(${var.latency_time_aggregator}(${var.latency_timeframe}),${var.latency_timeframe}): (
|
change(${var.latency_time_aggregator}(${var.latency_timeframe}),${var.latency_timeframe}): (
|
||||||
avg:redis.info.latency_ms${module.filter-tags.query_alert} by {redis_host,redis_port}
|
avg:redis.info.latency_ms${module.filter-tags.query_alert} by {redis_host,redis_port}
|
||||||
) > ${var.latency_threshold_critical}
|
) > ${var.latency_threshold_critical}
|
||||||
EOL
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -304,13 +304,13 @@ resource "datadog_monitor" "hitrate" {
|
|||||||
name = "[${var.environment}] Redis hitrate {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Redis hitrate {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.hitrate_message, var.message)}"
|
message = "${coalesce(var.hitrate_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOL
|
query = <<EOQ
|
||||||
${var.hitrate_time_aggregator}(${var.hitrate_timeframe}): (
|
${var.hitrate_time_aggregator}(${var.hitrate_timeframe}): (
|
||||||
sum:redis.stats.keyspace_hits${module.filter-tags.query_alert} by {redis_host,redis_port}
|
sum:redis.stats.keyspace_hits${module.filter-tags.query_alert} by {redis_host,redis_port}
|
||||||
/ (sum:redis.stats.keyspace_hits${module.filter-tags.query_alert} by {redis_host,redis_port}
|
/ (sum:redis.stats.keyspace_hits${module.filter-tags.query_alert} by {redis_host,redis_port}
|
||||||
+ sum:redis.stats.keyspace_misses${module.filter-tags.query_alert} by {redis_host,redis_port})
|
+ sum:redis.stats.keyspace_misses${module.filter-tags.query_alert} by {redis_host,redis_port})
|
||||||
) * 100 < ${var.hitrate_threshold_critical}
|
) * 100 < ${var.hitrate_threshold_critical}
|
||||||
EOL
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -5,9 +5,9 @@ resource "datadog_monitor" "datadog_apache_process" {
|
|||||||
|
|
||||||
type = "service check"
|
type = "service check"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
"apache.can_connect"${module.filter-tags.service_check}.by("port","server").last(6).count_by_status()
|
"apache.can_connect"${module.filter-tags.service_check}.by("port","server").last(6).count_by_status()
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds = {
|
thresholds = {
|
||||||
warning = "${var.apache_connect_threshold_warning}"
|
warning = "${var.apache_connect_threshold_warning}"
|
||||||
|
|||||||
@ -5,9 +5,9 @@ resource "datadog_monitor" "datadog_nginx_process" {
|
|||||||
|
|
||||||
type = "service check"
|
type = "service check"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
"nginx.can_connect"${module.filter-tags.service_check}.by("server","port").last(6).count_by_status()
|
"nginx.can_connect"${module.filter-tags.service_check}.by("server","port").last(6).count_by_status()
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds = {
|
thresholds = {
|
||||||
warning = "${var.nginx_connect_threshold_warning}"
|
warning = "${var.nginx_connect_threshold_warning}"
|
||||||
@ -36,7 +36,11 @@ resource "datadog_monitor" "datadog_nginx_dropped_connections" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = "${var.nginx_dropped_time_aggregator}(${var.nginx_dropped_timeframe}):avg:nginx.net.conn_dropped_per_s${module.filter-tags.query_alert} by {host} > ${var.nginx_dropped_threshold_critical}"
|
query = <<EOQ
|
||||||
|
${var.nginx_dropped_time_aggregator}(${var.nginx_dropped_timeframe}):
|
||||||
|
avg:nginx.net.conn_dropped_per_s${module.filter-tags.query_alert} by {host}
|
||||||
|
> ${var.nginx_dropped_threshold_critical}
|
||||||
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
critical = "${var.nginx_dropped_threshold_critical}"
|
critical = "${var.nginx_dropped_threshold_critical}"
|
||||||
|
|||||||
@ -5,9 +5,9 @@ resource "datadog_monitor" "php_fpm_connect" {
|
|||||||
|
|
||||||
type = "service check"
|
type = "service check"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
"php_fpm.can_ping"${module.filter-tags.service_check}.by("ping_url").last(6).count_by_status()
|
"php_fpm.can_ping"${module.filter-tags.service_check}.by("ping_url").last(6).count_by_status()
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds = {
|
thresholds = {
|
||||||
warning = "${var.php_fpm_connect_threshold_warning}"
|
warning = "${var.php_fpm_connect_threshold_warning}"
|
||||||
@ -36,13 +36,13 @@ resource "datadog_monitor" "php_fpm_connect_idle" {
|
|||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.php_fpm_busy_time_aggregator}(${var.php_fpm_busy_timeframe}): (
|
${var.php_fpm_busy_time_aggregator}(${var.php_fpm_busy_timeframe}): (
|
||||||
avg:php_fpm.processes.active${module.filter-tags.query_alert} by {host, pool} /
|
avg:php_fpm.processes.active${module.filter-tags.query_alert} by {host, pool} /
|
||||||
( avg:php_fpm.processes.idle${module.filter-tags.query_alert} by {host, pool} +
|
( avg:php_fpm.processes.idle${module.filter-tags.query_alert} by {host, pool} +
|
||||||
avg:php_fpm.processes.active${module.filter-tags.query_alert} by {host, pool} )
|
avg:php_fpm.processes.active${module.filter-tags.query_alert} by {host, pool} )
|
||||||
) * 100 > ${var.php_fpm_busy_threshold_critical}
|
) * 100 > ${var.php_fpm_busy_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
thresholds {
|
thresholds {
|
||||||
warning = "${var.php_fpm_busy_threshold_warning}"
|
warning = "${var.php_fpm_busy_threshold_warning}"
|
||||||
|
|||||||
@ -3,11 +3,11 @@ resource "datadog_monitor" "cpu" {
|
|||||||
name = "[${var.environment}] CPU usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] CPU usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.cpu_message, var.message)}"
|
message = "${coalesce(var.cpu_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.cpu_time_aggregator}(${var.cpu_timeframe}): (
|
${var.cpu_time_aggregator}(${var.cpu_timeframe}): (
|
||||||
100 - avg:system.cpu.idle${module.filter-tags.query_alert} by {host}
|
100 - avg:system.cpu.idle${module.filter-tags.query_alert} by {host}
|
||||||
) > ${var.cpu_threshold_critical}
|
) > ${var.cpu_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -35,11 +35,11 @@ resource "datadog_monitor" "load" {
|
|||||||
name = "[${var.environment}] CPU load 5 ratio {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
name = "[${var.environment}] CPU load 5 ratio {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
|
||||||
message = "${coalesce(var.load_message, var.message)}"
|
message = "${coalesce(var.load_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.load_time_aggregator}(${var.load_timeframe}): (
|
${var.load_time_aggregator}(${var.load_timeframe}): (
|
||||||
avg:system.load.norm.5${module.filter-tags.query_alert} by {host}
|
avg:system.load.norm.5${module.filter-tags.query_alert} by {host}
|
||||||
) > ${var.load_threshold_critical}
|
) > ${var.load_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -67,11 +67,11 @@ resource "datadog_monitor" "disk_space" {
|
|||||||
name = "[${var.environment}] Disk space usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Disk space usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.disk_space_message, var.message)}"
|
message = "${coalesce(var.disk_space_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.disk_space_time_aggregator}(${var.disk_space_timeframe}):
|
${var.disk_space_time_aggregator}(${var.disk_space_timeframe}):
|
||||||
avg:system.disk.in_use${module.filter-tags-disk.query_alert} by {host,device}
|
avg:system.disk.in_use${module.filter-tags-disk.query_alert} by {host,device}
|
||||||
* 100 > ${var.disk_space_threshold_critical}
|
* 100 > ${var.disk_space_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -99,7 +99,7 @@ resource "datadog_monitor" "disk_space_forecast" {
|
|||||||
name = "[${var.environment}] Disk Space usage could reach {{#is_alert}}{{threshold}}%{{/is_alert}} in a near future"
|
name = "[${var.environment}] Disk Space usage could reach {{#is_alert}}{{threshold}}%{{/is_alert}} in a near future"
|
||||||
message = "${coalesce(var.disk_space_forecast_message, var.message)}"
|
message = "${coalesce(var.disk_space_forecast_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.disk_space_forecast_time_aggregator}(${var.disk_space_forecast_timeframe}):
|
${var.disk_space_forecast_time_aggregator}(${var.disk_space_forecast_timeframe}):
|
||||||
forecast(avg:system.disk.in_use${module.filter-tags-disk.query_alert} by {host,device} * 100,
|
forecast(avg:system.disk.in_use${module.filter-tags-disk.query_alert} by {host,device} * 100,
|
||||||
'${var.disk_space_forecast_algorithm}',
|
'${var.disk_space_forecast_algorithm}',
|
||||||
@ -109,7 +109,7 @@ resource "datadog_monitor" "disk_space_forecast" {
|
|||||||
${var.disk_space_forecast_algorithm == "seasonal" ? format("seasonality='%s'", var.disk_space_forecast_seasonal_seasonality): ""}
|
${var.disk_space_forecast_algorithm == "seasonal" ? format("seasonality='%s'", var.disk_space_forecast_seasonal_seasonality): ""}
|
||||||
)
|
)
|
||||||
>= ${var.disk_space_forecast_threshold_critical}
|
>= ${var.disk_space_forecast_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
@ -139,11 +139,11 @@ resource "datadog_monitor" "disk_inodes" {
|
|||||||
name = "[${var.environment}] Disk inodes usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Disk inodes usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.disk_inodes_message, var.message)}"
|
message = "${coalesce(var.disk_inodes_message, var.message)}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.disk_inodes_time_aggregator}(${var.disk_inodes_timeframe}):
|
${var.disk_inodes_time_aggregator}(${var.disk_inodes_timeframe}):
|
||||||
avg:system.fs.inodes.in_use${module.filter-tags-disk.query_alert} by {host,device}
|
avg:system.fs.inodes.in_use${module.filter-tags-disk.query_alert} by {host,device}
|
||||||
* 100 > ${var.disk_inodes_threshold_critical}
|
* 100 > ${var.disk_inodes_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
@ -171,12 +171,12 @@ resource "datadog_monitor" "memory" {
|
|||||||
name = "[${var.environment}] Usable Memory {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Usable Memory {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${var.memory_message}"
|
message = "${var.memory_message}"
|
||||||
|
|
||||||
query = <<EOF
|
query = <<EOQ
|
||||||
${var.memory_time_aggregator}(${var.memory_timeframe}):
|
${var.memory_time_aggregator}(${var.memory_timeframe}):
|
||||||
avg:system.mem.usable${module.filter-tags.query_alert} by {host} /
|
avg:system.mem.usable${module.filter-tags.query_alert} by {host} /
|
||||||
avg:system.mem.total${module.filter-tags.query_alert} by {host} * 100
|
avg:system.mem.total${module.filter-tags.query_alert} by {host} * 100
|
||||||
< ${var.memory_threshold_critical}
|
< ${var.memory_threshold_critical}
|
||||||
EOF
|
EOQ
|
||||||
|
|
||||||
type = "metric alert"
|
type = "metric alert"
|
||||||
|
|
||||||
|
|||||||
@ -3,7 +3,9 @@ resource "datadog_monitor" "host_unreachable" {
|
|||||||
name = "[${var.environment}] Host unreachable"
|
name = "[${var.environment}] Host unreachable"
|
||||||
message = "${coalesce(var.unreachable_message, var.message)}"
|
message = "${coalesce(var.unreachable_message, var.message)}"
|
||||||
|
|
||||||
query = "\"datadog.agent.up\"${module.filter-tags.service_check}.last(6).count_by_status()"
|
query = <<EOQ
|
||||||
|
"datadog.agent.up"${module.filter-tags.service_check}.last(6).count_by_status()
|
||||||
|
EOQ
|
||||||
|
|
||||||
type = "service check"
|
type = "service check"
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user