Merge branch 'MON-486_changes_for_ter_0.12.6' into 'master'

Resolve MON-486 "Changes for ter 0.12.6"

Closes MON-486

See merge request claranet/pt-monitoring/projects/datadog/terraform/monitors!83

commit e543872391

@@ -6,7 +6,7 @@ stages:
   - deploy
 
 auto_update:
-  image: qmanfroi/datadog-terraform:latest
+  image: claranet/datadog-terraform:latest
   stage: test
   script:
     - ./scripts/auto_update.sh ./

@@ -31,11 +31,11 @@ After any change on this repo, you need to run the `./scripts/auto_update.sh ./`
 
 ### Terraform ###
 
-Version >= 0.12 is required to use these modules of monitors.
+Here is the minimum version required to use these modules of integrations.
 
 ```
 terraform {
-  required_version = "~> 0.12"
+  required_version = ">= 0.12.6"
 }
 
 ```

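A note on the constraint change above, since it is easy to misread: the pessimistic operator `~> 0.12` means `>= 0.12.0, < 1.0.0`, so it still admits 0.12.0 through 0.12.5, while `>= 0.12.6` sets an explicit floor at the first version these modules now support. A minimal consumer sketch follows; the module source path and the alerting target are hypothetical, while `environment` and `message` are inputs that appear elsewhere in this diff:

```
terraform {
  # "~> 0.12" would still allow 0.12.0-0.12.5; ">= 0.12.6" pins an explicit
  # floor (presumably needed for the single-line template expressions used
  # in the query heredocs further down in this commit).
  required_version = ">= 0.12.6"
}

# Hypothetical consumer of one of the monitor modules in this repo:
module "kubernetes_node" {
  source      = "git::https://example.com/claranet/monitors.git//kubernetes/node"  # hypothetical path
  environment = "prod"
  message     = "@pagerduty-myteam"  # hypothetical notification target
}
```
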
@@ -66,7 +66,7 @@ resource "datadog_monitor" "memory_pressure" {
   message = coalesce(var.memory_pressure_message, var.message)
   type = "service check"
 
   query = <<EOQ
 "kubernetes_state.node.memory_pressure"${module.filter-tags.service_check}.by("kubernetescluster","node").last(6).count_by_status()
 EOQ
 

@@ -97,7 +97,7 @@ resource "datadog_monitor" "ready" {
   message = coalesce(var.ready_message, var.message)
   type = "service check"
 
   query = <<EOQ
 "kubernetes_state.node.ready"${module.filter-tags.service_check}.by("kubernetescluster","node").last(6).count_by_status()
 EOQ
 

@@ -190,7 +190,7 @@ resource "datadog_monitor" "unregister_net_device" {
   message = coalesce(var.unregister_net_device_message, var.message)
   type = "event alert"
 
   query = <<EOQ
 events('sources:kubernetes priority:all ${module.filter-tags.event_alert} \"UnregisterNetDevice\"').rollup('count').last('${var.unregister_net_device_timeframe}') > ${var.unregister_net_device_threshold_critical}
 EOQ
 

@@ -215,7 +215,7 @@ resource "datadog_monitor" "node_unschedulable" {
   message = coalesce(var.node_unschedulable_message, var.message)
   type = "metric alert"
 
   query = <<EOQ
 ${var.node_unschedulable_time_aggregator}(${var.node_unschedulable_timeframe}):
 sum:kubernetes_state.node.status${module.filter-tags-unschedulable.query_alert} by {kubernetescluster,node}
 > 0

@@ -66,7 +66,7 @@ resource "datadog_monitor" "replica_available" {
   message = coalesce(var.replica_available_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.replica_available_time_aggregator}(${var.replica_available_timeframe}):
 max:kubernetes_state.deployment.replicas_desired${module.filter-tags.query_alert} by {namespace, deployment} -
 max:kubernetes_state.deployment.replicas_available${module.filter-tags.query_alert} by {namespace, deployment}

@@ -100,7 +100,7 @@ resource "datadog_monitor" "replica_ready" {
   message = coalesce(var.replica_ready_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.replica_available_time_aggregator}(${var.replica_available_timeframe}):
 max:kubernetes_state.replicaset.replicas_desired${module.filter-tags.query_alert} by {namespace, replicaset} -
 max:kubernetes_state.replicaset.replicas_ready${module.filter-tags.query_alert} by {namespace, replicaset}

@@ -70,7 +70,7 @@ resource "datadog_monitor" "ALB_httpcode_5xx" {
   message = coalesce(var.httpcode_alb_5xx_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.httpcode_alb_5xx_time_aggregator}(${var.httpcode_alb_5xx_timeframe}):
 default(avg:aws.applicationelb.httpcode_elb_5xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
 default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))

@@ -103,7 +103,7 @@ resource "datadog_monitor" "ALB_httpcode_4xx" {
   message = coalesce(var.httpcode_alb_4xx_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.httpcode_alb_4xx_time_aggregator}(${var.httpcode_alb_4xx_timeframe}):
 default(avg:aws.applicationelb.httpcode_elb_4xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
 default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))

@@ -72,7 +72,7 @@ resource "datadog_monitor" "API_http_4xx_errors_count" {
   message = coalesce(var.http_4xx_requests_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.http_4xx_requests_time_aggregator}(${var.http_4xx_requests_timeframe}):
 default(avg:aws.apigateway.4xxerror{${var.filter_tags}} by {region,apiname,stage}.as_rate(), 0) / (
 default(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname,stage}.as_rate() + ${var.artificial_requests_count}, 1))

@@ -67,7 +67,7 @@ resource "datadog_monitor" "elasticache_no_connection" {
   message = coalesce(var.no_connection_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.no_connection_time_aggregator}(${var.no_connection_timeframe}): (
 avg:aws.elasticache.curr_connections${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
 ) <= 0

@@ -96,7 +96,7 @@ resource "datadog_monitor" "elasticache_swap" {
   message = coalesce(var.swap_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.swap_time_aggregator}(${var.swap_timeframe}): (
 avg:aws.elasticache.swap_usage${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
 ) > ${var.swap_threshold_critical}

@@ -69,7 +69,7 @@ resource "datadog_monitor" "redis_replication_lag" {
   message = coalesce(var.replication_lag_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.replication_lag_time_aggregator}(${var.replication_lag_timeframe}): (
 avg:aws.elasticache.replication_lag${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
 ) > ${var.replication_lag_threshold_critical}

@@ -103,7 +103,7 @@ resource "datadog_monitor" "redis_commands" {
   message = coalesce(var.commands_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 sum(${var.commands_timeframe}): (
 avg:aws.elasticache.get_type_cmds${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count() +
 avg:aws.elasticache.set_type_cmds${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count()

@@ -81,7 +81,7 @@ resource "datadog_monitor" "es_cpu_90_15min" {
   message = coalesce(var.cpu_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.cpu_time_aggregator}(${var.cpu_timeframe}): (
 avg:aws.es.cpuutilization${module.filter-tags.query_alert} by {region,name}
 ) > ${var.cpu_threshold_critical}

@@ -75,7 +75,7 @@ resource "datadog_monitor" "ELB_too_much_5xx" {
   message = coalesce(var.elb_5xx_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 sum(${var.elb_5xx_timeframe}):
 default(avg:aws.elb.httpcode_elb_5xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
 default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))

@@ -110,7 +110,7 @@ resource "datadog_monitor" "ELB_too_much_4xx_backend" {
   message = coalesce(var.elb_backend_4xx_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 sum(${var.elb_backend_4xx_timeframe}):
 default(avg:aws.elb.httpcode_backend_4xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
 default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))

@@ -74,7 +74,7 @@ resource "datadog_monitor" "rds_replica_lag" {
   message = coalesce(var.replicalag_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 avg(${var.replicalag_timeframe}): (
 avg:aws.rds.replica_lag${module.filter-tags.query_alert} by {region,name}
 ) > ${var.replicalag_threshold_critical}

@@ -70,7 +70,7 @@ resource "datadog_monitor" "apimgt_other_requests" {
   message = coalesce(var.other_requests_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.other_requests_time_aggregator}(${var.other_requests_timeframe}): (
 default(avg:azure.apimanagement_service.other_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
 default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)

@@ -105,7 +105,7 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" {
   message = coalesce(var.unauthorized_requests_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.unauthorized_requests_time_aggregator}(${var.unauthorized_requests_timeframe}): (
 default(avg:azure.apimanagement_service.unauthorized_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
 default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)

@@ -71,7 +71,7 @@ resource "datadog_monitor" "appservices_http_5xx_errors_count" {
   message = coalesce(var.http_5xx_requests_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.http_5xx_requests_time_aggregator}(${var.http_5xx_requests_timeframe}): (
 default(avg:azure.app_services.http5xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) /
 default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 1)

@@ -105,7 +105,7 @@ resource "datadog_monitor" "appservices_http_4xx_errors_count" {
   message = coalesce(var.http_4xx_requests_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.http_4xx_requests_time_aggregator}(${var.http_4xx_requests_timeframe}): (
 default(avg:azure.app_services.http4xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) /
 default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 1)

@@ -82,7 +82,7 @@ resource "datadog_monitor" "cosmos_db_5xx_requests" {
   message = coalesce(var.cosmos_db_5xx_requests_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.cosmos_db_5xx_request_time_aggregator}(${var.cosmos_db_5xx_request_timeframe}): default( (
 default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "500")} by {resource_group,region,name,databasename,collectionname}.as_rate(), 0) +
 default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "503")} by {resource_group,region,name,databasename,collectionname}.as_rate(), 0)) /

@@ -119,7 +119,7 @@ resource "datadog_monitor" "cosmos_db_scaling" {
   type = "query alert"
 
   # List of available status codes : https://docs.microsoft.com/en-us/rest/api/cosmos-db/http-status-codes-for-cosmosdb
   query = <<EOQ
 ${var.cosmos_db_scaling_time_aggregator}(${var.cosmos_db_scaling_timeframe}): default(
 default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,databasename,collectionname}.as_rate(), 0) /
 default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,databasename,collectionname}.as_rate(), 0)

@@ -71,7 +71,7 @@ resource "datadog_monitor" "eventgrid_unmatched_events" {
   message = coalesce(var.unmatched_events_rate_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.unmatched_events_rate_time_aggregator}(${var.unmatched_events_rate_timeframe}): (default(
 avg:azure.eventgrid_topics.unmatched_event_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() /
 (avg:azure.eventgrid_topics.publish_success_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() +

@@ -68,7 +68,7 @@ resource "datadog_monitor" "eventhub_errors" {
   message = coalesce(var.errors_rate_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.errors_rate_time_aggregator}(${var.errors_rate_timeframe}): ( (
 default(avg:azure.eventhub_namespaces.internal_server_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +
 default(avg:azure.eventhub_namespaces.server_busy_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) +

@@ -69,7 +69,7 @@ resource "datadog_monitor" "function_high_threads_count" {
   message = coalesce(var.high_threads_count_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.high_threads_count_time_aggregator}(${var.high_threads_count_timeframe}):
 default(azure.functions.thread_count${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0)
 > ${var.high_threads_count_threshold_critical}

@@ -78,7 +78,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" {
   message = coalesce(var.failed_queryjobs_rate_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.failed_queryjobs_rate_time_aggregator}(${var.failed_queryjobs_rate_timeframe}):
 default(
 default(avg:azure.devices_iothubs.jobs.query_jobs.failure${module.filter-tags.query_alert} by {resource_group,name}.as_rate(), 0) / (

@@ -115,7 +115,7 @@ resource "datadog_monitor" "status" {
   message = coalesce(var.status_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.status_time_aggregator}(${var.status_timeframe}): (
 avg:azure.devices_iothubs.status${module.filter-tags.query_alert} by {resource_group,region,name}
 ) < 1

@@ -210,7 +210,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
   message = coalesce(var.failed_c2d_twin_read_rate_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.failed_c2d_twin_read_rate_time_aggregator}(${var.failed_c2d_twin_read_rate_timeframe}):
 default(
 default(avg:azure.devices_iothubs.c2d.twin.read.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (

@@ -247,7 +247,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
   message = coalesce(var.failed_c2d_twin_update_rate_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.failed_c2d_twin_update_rate_time_aggregator}(${var.failed_c2d_twin_update_rate_timeframe}):
 default(
 default(avg:azure.devices_iothubs.c2d.twin.update.failure${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (

@@ -358,7 +358,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
   message = coalesce(var.dropped_d2c_telemetry_egress_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.dropped_d2c_telemetry_egress_time_aggregator}(${var.dropped_d2c_telemetry_egress_timeframe}):
 default(
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (

@@ -397,7 +397,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
   message = coalesce(var.orphaned_d2c_telemetry_egress_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.orphaned_d2c_telemetry_egress_time_aggregator}(${var.orphaned_d2c_telemetry_egress_timeframe}):
 default(
 default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / (

@@ -70,7 +70,7 @@ resource "datadog_monitor" "keyvault_api_latency" {
   message = coalesce(var.status_message, var.message)
   type = "metric alert"
 
   query = <<EOQ
 ${var.api_latency_time_aggregator}(${var.api_latency_timeframe}):
 avg:azure.keyvault_vaults.service_api_latency${module.filter-tags-activity.query_alert} by {name,resource_group,region}
 > ${var.api_latency_threshold_critical}

@@ -72,7 +72,7 @@ resource "datadog_monitor" "mysql_io_consumption" {
   message = coalesce(var.io_consumption_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.io_consumption_time_aggregator}(${var.io_consumption_timeframe}): (
 avg:azure.dbformysql_servers.io_consumption_percent${module.filter-tags.query_alert} by {resource_group,region,name}
 ) > ${var.io_consumption_threshold_critical}

@@ -106,7 +106,7 @@ resource "datadog_monitor" "mysql_memory_usage" {
   message = coalesce(var.memory_usage_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.memory_usage_time_aggregator}(${var.memory_usage_timeframe}): (
 avg:azure.dbformysql_servers.memory_percent${module.filter-tags.query_alert} by {resource_group,region,name}
 ) > ${var.memory_usage_threshold_critical}

@@ -67,7 +67,7 @@ resource "datadog_monitor" "postgresql_free_storage" {
   message = coalesce(var.free_storage_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.free_storage_time_aggregator}(${var.free_storage_timeframe}): (
 100 - avg:azure.dbforpostgresql_servers.storage_percent${module.filter-tags.query_alert} by {resource_group,region,name}
 ) < ${var.free_storage_threshold_critical}

@@ -101,7 +101,7 @@ resource "datadog_monitor" "postgresql_io_consumption" {
   message = coalesce(var.io_consumption_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.io_consumption_time_aggregator}(${var.io_consumption_timeframe}): (
 avg:azure.dbforpostgresql_servers.io_consumption_percent${module.filter-tags.query_alert} by {resource_group,region,name}
 ) > ${var.io_consumption_threshold_critical}

@@ -67,7 +67,7 @@ resource "datadog_monitor" "percent_processor_time" {
   message = coalesce(var.percent_processor_time_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.percent_processor_time_time_aggregator}(${var.percent_processor_time_timeframe}): (
 avg:azure.cache_redis.percent_processor_time${module.filter-tags.query_alert} by {resource_group,region,name}
 ) > ${var.percent_processor_time_threshold_critical}

@@ -101,7 +101,7 @@ resource "datadog_monitor" "server_load" {
   message = coalesce(var.server_load_rate_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.server_load_rate_time_aggregator}(${var.server_load_rate_timeframe}): (
 avg:azure.cache_redis.server_load${module.filter-tags.query_alert} by {resource_group,region,name}
 ) > ${var.server_load_rate_threshold_critical}

@@ -67,7 +67,7 @@ resource "datadog_monitor" "memory_percentage" {
   message = coalesce(var.memory_percentage_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.memory_percentage_time_aggregator}(${var.memory_percentage_timeframe}): (
 avg:azure.web_serverfarms.memory_percentage${module.filter-tags.query_alert} by {resource_group,region,name,instance}
 ) > ${var.memory_percentage_threshold_critical}

@@ -62,7 +62,7 @@ resource "datadog_monitor" "service_bus_user_errors" {
   message = coalesce(var.user_errors_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.user_errors_time_aggregator}(${var.user_errors_timeframe}): (
 default(avg:azure.servicebus_namespaces.user_errors.preview${module.filter-tags.query_alert} by {resource_group,region,name,entityname}, 0) /
 default(avg:azure.servicebus_namespaces.incoming_requests_preview${module.filter-tags.query_alert} by {resource_group,region,name,entityname}, 1)

@@ -97,7 +97,7 @@ resource "datadog_monitor" "service_bus_server_errors" {
   message = coalesce(var.server_errors_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.server_errors_time_aggregator}(${var.server_errors_timeframe}): (
 default(avg:azure.servicebus_namespaces.server_errors.preview${module.filter-tags.query_alert} by {resource_group,region,name,entityname}, 0) /
 default(avg:azure.servicebus_namespaces.incoming_requests_preview${module.filter-tags.query_alert} by {resource_group,region,name,entityname}, 1)

@@ -67,7 +67,7 @@ resource "datadog_monitor" "sql-database_free_space_low" {
   message = coalesce(var.diskspace_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.diskspace_time_aggregator}(${var.diskspace_timeframe}): (
 avg:azure.sql_servers_databases.storage_percent${module.filter-tags.query_alert} by {resource_group,region,server_name,name}
 ) > ${var.diskspace_threshold_critical}

@@ -101,7 +101,7 @@ resource "datadog_monitor" "sql-database_dtu_consumption_high" {
   message = coalesce(var.dtu_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.dtu_time_aggregator}(${var.dtu_timeframe}): (
 avg:azure.sql_servers_databases.dtu_consumption_percent${module.filter-tags.query_alert} by {resource_group,region,server_name,name}
 ) > ${var.dtu_threshold_critical}

@@ -72,7 +72,7 @@ resource "datadog_monitor" "sql_elasticpool_dtu_consumption_high" {
   message = coalesce(var.dtu_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.dtu_time_aggregator}(${var.dtu_timeframe}): (
 azure.sql_servers_elasticpools.dtu_consumption_percent${module.filter-tags.query_alert} by {resource_group,region,server_name,name}
 ) > ${var.dtu_threshold_critical}

@@ -74,7 +74,7 @@ resource "datadog_monitor" "fileservices_requests_error" {
   count = var.successful_requests_enabled == "true" ? 1 : 0
   name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Azure Storage File service too few successful requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
   message = coalesce(var.successful_requests_message, var.message)
   query = <<EOQ
 ${var.successful_requests_time_aggregator}(${var.successful_requests_timeframe}):
 default(100-(default(sum:azure.storage_storageaccounts_fileservices.transactions${module.filter-tags-success.query_alert} by {name}.as_rate(),0) /
 default(sum:azure.storage_storageaccounts_fileservices.transactions${module.filter-tags.query_alert} by {name}.as_rate(),0)

@@ -939,7 +939,7 @@ resource "datadog_monitor" "blob_client_other_error_requests" {
   count = var.client_other_error_requests_enabled == "true" ? 1 : 0
   name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Azure Blob Storage too many client_other errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
   message = coalesce(var.client_other_error_requests_message, var.message)
   query = <<EOQ
 ${var.client_other_error_requests_time_aggregator}(${var.client_other_error_requests_timeframe}):
 default((default(sum:azure.storage_storageaccounts_blobservices.transactions${module.filter-tags-client-other-error.query_alert} by {name}.as_rate(),0) /
 default(sum:azure.storage_storageaccounts_blobservices.transactions${module.filter-tags.query_alert} by {name}.as_rate(),0)

@@ -67,7 +67,7 @@ resource "datadog_monitor" "failed_function_requests" {
   message = coalesce(var.failed_function_requests_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.failed_function_requests_time_aggregator}(${var.failed_function_requests_timeframe}): (
 default(avg:azure.streamanalytics_streamingjobs.aml_callout_failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
 default(avg:azure.streamanalytics_streamingjobs.aml_callout_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)

@@ -102,7 +102,7 @@ resource "datadog_monitor" "conversion_errors" {
   message = coalesce(var.conversion_errors_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.conversion_errors_time_aggregator}(${var.conversion_errors_timeframe}): (
 avg:azure.streamanalytics_streamingjobs.conversion_errors${module.filter-tags.query_alert} by {resource_group,region,name}
 ) > ${var.conversion_errors_threshold_critical}

@@ -67,7 +67,7 @@ resource "datadog_monitor" "virtualmachine_credit_cpu_remaining_too_low" {
   message = coalesce(var.cpu_remaining_rate_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.cpu_remaining_rate_time_aggregator}(${var.cpu_remaining_rate_timeframe}):
 default(
 default(avg:azure.vm.cpu_credits_remaining${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 100) / (

@@ -81,7 +81,7 @@ resource "datadog_monitor" "scanned_bytes" {
   message = coalesce(var.scanned_bytes_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 avg(${var.scanned_bytes_timeframe}):
 default(avg:gcp.bigquery.query.scanned_bytes{${var.filter_tags}}, 0)
 > ${var.scanned_bytes_threshold_critical}

@@ -118,7 +118,7 @@ resource "datadog_monitor" "scanned_bytes_billed" {
   message = coalesce(var.scanned_bytes_billed_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 avg(${var.scanned_bytes_billed_timeframe}):
 default(avg:gcp.bigquery.query.scanned_bytes_billed{${var.filter_tags}}, 0)
 > ${var.scanned_bytes_billed_threshold_critical}

@@ -229,7 +229,7 @@ resource "datadog_monitor" "table_count" {
   message = coalesce(var.table_count_message, var.message)
   type = "metric alert"
 
   query = <<EOQ
 avg(${var.table_count_timeframe}):
 avg:gcp.bigquery.storage.table_count{${var.filter_tags}} by {dataset_id}
 > ${var.table_count_threshold_critical}

@@ -266,7 +266,7 @@ resource "datadog_monitor" "uploaded_bytes" {
   message = coalesce(var.uploaded_bytes_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 avg(${var.uploaded_bytes_timeframe}):
 default(avg:gcp.bigquery.storage.uploaded_bytes{${var.filter_tags}} by {dataset_id,table}, 0)
 > ${var.uploaded_bytes_threshold_critical}

@@ -81,22 +81,15 @@ resource "datadog_monitor" "disk_utilization_forecast" {
   message = coalesce(var.disk_utilization_forecast_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.disk_utilization_forecast_time_aggregator}(${var.disk_utilization_forecast_timeframe}):
 forecast(
 avg:gcp.cloudsql.database.disk.utilization{${var.filter_tags}} by {database_id} * 100,
 '${var.disk_utilization_forecast_algorithm}',
 ${var.disk_utilization_forecast_deviations},
 interval='${var.disk_utilization_forecast_interval}',
-${var.disk_utilization_forecast_algorithm == "linear" ? format(
-"history='%s',model='%s'",
-var.disk_utilization_forecast_linear_history,
-var.disk_utilization_forecast_linear_model,
-) : ""}
-${var.disk_utilization_forecast_algorithm == "seasonal" ? format(
-"seasonality='%s'",
-var.disk_utilization_forecast_seasonal_seasonality,
-) : ""}
+${var.disk_utilization_forecast_algorithm == "linear" ? format("history='%s',model='%s'", var.disk_utilization_forecast_linear_history, var.disk_utilization_forecast_linear_model) : ""}
+${var.disk_utilization_forecast_algorithm == "seasonal" ? format("seasonality='%s'", var.disk_utilization_forecast_seasonal_seasonality) : ""}
 )
 >= ${var.disk_utilization_forecast_threshold_critical}
 EOQ

@@ -170,21 +163,14 @@ resource "datadog_monitor" "memory_utilization_forecast" {
   type = "query alert"
 
   query = <<EOQ
 ${var.memory_utilization_forecast_time_aggregator}(${var.memory_utilization_forecast_timeframe}):
 forecast(
 avg:gcp.cloudsql.database.memory.utilization{${var.filter_tags}} by {database_id} * 100,
 '${var.memory_utilization_forecast_algorithm}',
 ${var.memory_utilization_forecast_deviations},
 interval='${var.memory_utilization_forecast_interval}',
-${var.memory_utilization_forecast_algorithm == "linear" ? format(
-"history='%s',model='%s'",
-var.memory_utilization_forecast_linear_history,
-var.memory_utilization_forecast_linear_model,
-) : ""}
-${var.memory_utilization_forecast_algorithm == "seasonal" ? format(
-"seasonality='%s'",
-var.memory_utilization_forecast_seasonal_seasonality,
-) : ""}
+${var.memory_utilization_forecast_algorithm == "linear" ? format("history='%s',model='%s'", var.memory_utilization_forecast_linear_history, var.memory_utilization_forecast_linear_model) : ""}
+${var.memory_utilization_forecast_algorithm == "seasonal" ? format("seasonality='%s'", var.memory_utilization_forecast_seasonal_seasonality) : ""}
 )
 >= ${var.memory_utilization_forecast_threshold_critical}
 EOQ

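The two forecast hunks above carry the only non-whitespace change in the monitor files: the conditional `format(...)` calls inside the query heredocs are collapsed onto single lines. Reading between the lines of the commit title, this looks like an accommodation to stricter template parsing around Terraform 0.12.6, where an interpolation sequence spread across several lines inside a heredoc no longer parses; that rationale is inferred from the diff, not stated in it. A minimal sketch of the resulting pattern, with shortened, hypothetical variable names and a generic metric:

```
variable "algorithm" { default = "linear" }
variable "history"   { default = "1w" }
variable "model"     { default = "default" }

locals {
  # Each ${ ... } sequence stays on one line inside the heredoc; the
  # optional forecast arguments are built with a single-line conditional.
  forecast_query = <<EOQ
forecast(avg:system.disk.in_use{*} * 100, '${var.algorithm}', 2,
${var.algorithm == "linear" ? format("history='%s',model='%s'", var.history, var.model) : ""})
EOQ
}
```
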
@@ -220,7 +206,7 @@ resource "datadog_monitor" "failover_unavailable" {
   message = coalesce(var.failover_unavailable_message, var.message)
   type = "metric alert"
 
   query = <<EOQ
 ${var.failover_unavailable_time_aggregator}(${var.failover_unavailable_timeframe}):
 avg:gcp.cloudsql.database.available_for_failover{${var.filter_tags}}
 by {database_id}

@@ -87,7 +87,7 @@ resource "datadog_monitor" "disk_throttled_ops" {
   message = coalesce(var.disk_throttled_ops_message, var.message)
   type = "query alert"
 
   query = <<EOQ
 ${var.disk_throttled_ops_time_aggregator}(${var.disk_throttled_ops_timeframe}):
 (
 sum:gcp.gce.instance.disk.throttled_read_ops_count{${var.filter_tags}} by {instance_name, device_name} +

@ -83,7 +83,7 @@ resource "datadog_monitor" "backend_latency_service" {
|
|||||||
message = coalesce(var.backend_latency_service_message, var.message)
|
message = coalesce(var.backend_latency_service_message, var.message)
|
||||||
type = "query alert"
|
type = "query alert"
|
||||||
|
|
||||||
query = <<EOQ
|
query = <<EOQ
|
||||||
${var.backend_latency_service_time_aggregator}(${var.backend_latency_service_timeframe}):
|
${var.backend_latency_service_time_aggregator}(${var.backend_latency_service_timeframe}):
|
||||||
default(min:gcp.loadbalancing.https.backend_latencies.avg{${var.filter_tags},backend_target_type:backend_service} by {backend_target_name,forwarding_rule_name}, 0)
|
default(min:gcp.loadbalancing.https.backend_latencies.avg{${var.filter_tags},backend_target_type:backend_service} by {backend_target_name,forwarding_rule_name}, 0)
|
||||||
> ${var.backend_latency_service_threshold_critical}
|
> ${var.backend_latency_service_threshold_critical}
|
||||||
@ -120,7 +120,7 @@ resource "datadog_monitor" "backend_latency_bucket" {
  message = coalesce(var.backend_latency_bucket_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.backend_latency_bucket_time_aggregator}(${var.backend_latency_bucket_timeframe}):
  default(min:gcp.loadbalancing.https.backend_latencies.avg{${var.filter_tags},backend_target_type:backend_bucket} by {backend_target_name,forwarding_rule_name}, 0)
  > ${var.backend_latency_bucket_threshold_critical}
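The `default(..., 0)` wrapper substitutes 0 where the load balancer reports no latency samples, so an idle backend evaluates against 0 instead of going no-data (the usual reason for this wrapper). Rendered, one of these queries looks roughly like the following, with illustrative timeframe and threshold:

```
min(last_10m):default(min:gcp.loadbalancing.https.backend_latencies.avg{env:prod,backend_target_type:backend_bucket} by {backend_target_name,forwarding_rule_name}, 0) > 1000
```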
@ -78,7 +78,7 @@ resource "datadog_monitor" "cluster_initializing_shards" {
  message = coalesce(var.cluster_initializing_shards_message, var.message)
  type    = "metric alert"

  query = <<EOQ
${var.cluster_initializing_shards_time_aggregator}(${var.cluster_initializing_shards_timeframe}):
  avg:elasticsearch.initializing_shards${module.filter-tags.query_alert} by {cluster_name}
  > ${var.cluster_initializing_shards_threshold_critical}
@ -113,7 +113,7 @@ resource "datadog_monitor" "cluster_relocating_shards" {
  message = coalesce(var.cluster_relocating_shards_message, var.message)
  type    = "metric alert"

  query = <<EOQ
${var.cluster_relocating_shards_time_aggregator}(${var.cluster_relocating_shards_timeframe}):
  avg:elasticsearch.relocating_shards${module.filter-tags.query_alert} by {cluster_name}
  > ${var.cluster_relocating_shards_threshold_critical}
@ -222,7 +222,7 @@ resource "datadog_monitor" "jvm_heap_memory_usage" {
  message = coalesce(var.jvm_heap_memory_usage_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.jvm_heap_memory_usage_time_aggregator}(${var.jvm_heap_memory_usage_timeframe}):
  avg:jvm.mem.heap_in_use${module.filter-tags.query_alert} by {node_name}
  > ${var.jvm_heap_memory_usage_threshold_critical}
@ -257,7 +257,7 @@ resource "datadog_monitor" "jvm_memory_young_usage" {
  message = coalesce(var.jvm_memory_young_usage_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.jvm_memory_young_usage_time_aggregator}(${var.jvm_memory_young_usage_timeframe}):
  avg:jvm.mem.pools.young.used${module.filter-tags.query_alert} by {node_name} / avg:jvm.mem.pools.young.max${module.filter-tags.query_alert} by {node_name} * 100
  > ${var.jvm_memory_young_usage_threshold_critical}
@ -362,7 +362,7 @@ resource "datadog_monitor" "jvm_gc_young_collection_latency" {
  message = coalesce(var.jvm_gc_young_collection_latency_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.jvm_gc_young_collection_latency_time_aggregator}(${var.jvm_gc_young_collection_latency_timeframe}):
  avg:jvm.gc.collectors.young.collection_time${module.filter-tags.query_alert} by {node_name} / avg:jvm.gc.collectors.young.count${module.filter-tags.query_alert} by {node_name} * 1000
  > ${var.jvm_gc_young_collection_latency_threshold_critical}
@ -398,7 +398,7 @@ resource "datadog_monitor" "indexing_latency" {
  type = "query alert"

  // TODO add tags to filter by node type and do not apply this monitor on non-data nodes
  query = <<EOQ
${var.indexing_latency_time_aggregator}(${var.indexing_latency_timeframe}):
  avg:elasticsearch.indexing.index.time${module.filter-tags.query_alert} by {node_name} / avg:elasticsearch.indexing.index.total${module.filter-tags.query_alert} by {node_name} * 1000
  > ${var.indexing_latency_threshold_critical}
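These Elasticsearch latency monitors all follow one pattern: a cumulative time counter divided by a cumulative operation counter gives the mean time per operation, and the `* 1000` converts units (seconds to milliseconds, assuming the time metric is reported in seconds). Rendered with illustrative filter, timeframe, and threshold:

```
avg(last_15m):
avg:elasticsearch.indexing.index.time{env:prod} by {node_name} /
avg:elasticsearch.indexing.index.total{env:prod} by {node_name} * 1000
> 50
```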
@ -518,7 +518,7 @@ resource "datadog_monitor" "search_query_latency" {
  type = "query alert"

  // TODO add tags to filter by node type and do not apply this monitor on non-data nodes
  query = <<EOQ
${var.search_query_latency_time_aggregator}(${var.search_query_latency_timeframe}):
  avg:elasticsearch.search.query.time${module.filter-tags.query_alert} by {node_name} / avg:elasticsearch.search.query.total${module.filter-tags.query_alert} by {node_name} * 1000
  > ${var.search_query_latency_threshold_critical}
@ -554,7 +554,7 @@ resource "datadog_monitor" "fetch_latency" {
  type = "query alert"

  // TODO add tags to filter by node type and do not apply this monitor on non-data nodes
  query = <<EOQ
${var.fetch_latency_time_aggregator}(${var.fetch_latency_timeframe}):
  avg:elasticsearch.search.fetch.time${module.filter-tags.query_alert} by {node_name} / avg:elasticsearch.search.fetch.total${module.filter-tags.query_alert} by {node_name} * 1000
  > ${var.fetch_latency_threshold_critical}
@ -660,7 +660,7 @@ resource "datadog_monitor" "field_data_evictions_change" {
  type = "query alert"

  // TODO add tags to filter by node type and do not apply this monitor on non-data nodes
  query = <<EOQ
change(${var.field_data_evictions_change_time_aggregator}(${var.field_data_evictions_change_timeframe}),${var.field_data_evictions_change_timeshift}):
  avg:elasticsearch.fielddata.evictions${module.filter-tags.query_alert} by {node_name}
  > ${var.field_data_evictions_change_threshold_critical}
@ -696,7 +696,7 @@ resource "datadog_monitor" "query_cache_evictions_change" {
  type = "query alert"

  // TODO add tags to filter by node type and do not apply this monitor on non-data nodes
  query = <<EOQ
change(${var.query_cache_evictions_change_time_aggregator}(${var.query_cache_evictions_change_timeframe}),${var.query_cache_evictions_change_timeshift}):
  avg:elasticsearch.indices.query_cache.evictions${module.filter-tags.query_alert} by {node_name}
  > ${var.query_cache_evictions_change_threshold_critical}
@ -8,6 +8,7 @@ resource "datadog_monitor" "mongodb_primary" {
${var.mongodb_primary_aggregator}(${var.mongodb_primary_timeframe}):
  min:mongodb.replset.state${module.filter-tags.query_alert} by {replset_name} >= 2
EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay
  notify_no_data   = true
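MongoDB replica-set members report state 1 for PRIMARY and 2 for SECONDARY, so when the minimum state across a replica set stays at 2 or above, no member is primary; that is the condition this monitor catches. Rendered with an illustrative aggregator, timeframe, and filter:

```
max(last_10m):min:mongodb.replset.state{env:prod} by {replset_name} >= 2
```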
@ -64,7 +65,7 @@ resource "datadog_monitor" "mongodb_server_count" {
  message = coalesce(var.mongodb_server_count_message, var.message)
  type    = "metric alert"

  query = <<EOQ
${var.mongodb_server_count_aggregator}(${var.mongodb_server_count_timeframe}):
  sum:mongodb.replset.health${module.filter-tags.query_alert} by {replset_name}
  > 99
@ -97,7 +98,7 @@ resource "datadog_monitor" "mongodb_replication" {
  message = coalesce(var.mongodb_replication_message, var.message)
  type    = "metric alert"

  query = <<EOQ
${var.mongodb_replication_aggregator}(${var.mongodb_replication_timeframe}):
  avg:mongodb.replset.replicationlag${module.filter-tags-secondary.query_alert} by {server} > ${var.mongodb_lag_critical}
EOQ
@ -69,7 +69,7 @@ resource "datadog_monitor" "mysql_aborted" {
  message = coalesce(var.mysql_aborted_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.mysql_aborted_time_aggregator}(${var.mysql_aborted_timeframe}): (
  avg:mysql.net.aborted_connects${module.filter-tags.query_alert} by {server} /
  avg:mysql.performance.threads_connected${module.filter-tags.query_alert} by {server}
@ -102,7 +102,7 @@ resource "datadog_monitor" "mysql_slow" {
  message = coalesce(var.mysql_slow_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.mysql_slow_time_aggregator}(${var.mysql_slow_timeframe}): (
  avg:mysql.performance.slow_queries${module.filter-tags.query_alert} by {server} /
  avg:mysql.performance.queries${module.filter-tags.query_alert} by {server}
@ -202,7 +202,7 @@ resource "datadog_monitor" "mysql_threads_anomaly" {
  message = coalesce(var.mysql_threads_message, var.message)
  type    = "metric alert"

  query = <<EOQ
${var.mysql_threads_time_aggregator}(${var.mysql_threads_timeframe}):
  anomalies(
    avg:mysql.performance.threads_running${module.filter-tags.query_alert} by {server},
@ -248,7 +248,7 @@ resource "datadog_monitor" "mysql_questions_anomaly" {
  message = coalesce(var.mysql_questions_message, var.message)
  type    = "metric alert"

  query = <<EOQ
${var.mysql_questions_time_aggregator}(${var.mysql_questions_timeframe}):
  anomalies(
    avg:mysql.performance.questions${module.filter-tags.query_alert} by {server},
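Both anomaly monitors wrap their metric in Datadog's anomalies() function, whose remaining arguments (the detection algorithm and the bounds width) are cut off by these hunks. A sketch of a complete query of this shape, with assumed arguments:

```
avg(last_4h):anomalies(avg:mysql.performance.threads_running{env:prod} by {server}, 'agile', 2) >= 1
```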
@ -68,7 +68,7 @@ resource "datadog_monitor" "postgresql_too_many_locks" {
  message = coalesce(var.postgresql_lock_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.postgresql_lock_time_aggregator}(${var.postgresql_lock_timeframe}):
  default(avg:postgresql.locks${module.filter-tags.query_alert} by {server}, 0)
  > ${var.postgresql_lock_threshold_critical}
@ -73,7 +73,7 @@ resource "datadog_monitor" "expirations" {
  message = coalesce(var.expirations_rate_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.expirations_rate_time_aggregator}(${var.expirations_rate_timeframe}): (
  avg:redis.expires.percent${module.filter-tags.query_alert} by {redis_host,redis_port}
) > ${var.expirations_rate_threshold_critical}
@ -107,7 +107,7 @@ resource "datadog_monitor" "blocked_clients" {
  message = coalesce(var.blocked_clients_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.blocked_clients_time_aggregator}(${var.blocked_clients_timeframe}): (
  sum:redis.clients.blocked${module.filter-tags.query_alert} by {redis_host,redis_port}
  / sum:redis.net.clients${module.filter-tags.query_alert} by {redis_host,redis_port}
@ -211,7 +211,7 @@ resource "datadog_monitor" "memory_frag" {
  message = coalesce(var.mem_frag_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.mem_frag_time_aggregator}(${var.mem_frag_timeframe}):
  avg:redis.mem.fragmentation_ratio${module.filter-tags.query_alert} by {redis_host,redis_port}
  * 100 > ${var.mem_frag_threshold_critical}
@ -245,7 +245,7 @@ resource "datadog_monitor" "rejected_connections" {
  message = coalesce(var.rejected_con_message, var.message)
  type    = "query alert"

  query = <<EOQ
change(${var.rejected_con_time_aggregator}(${var.rejected_con_timeframe}),${var.rejected_con_timeframe}): (
  avg:redis.net.rejected${module.filter-tags.query_alert} by {redis_host,redis_port}
) > ${var.rejected_con_threshold_critical}
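change(agg(timeframe),timeshift) evaluates the difference between the query's value now and its value one timeshift ago, so this monitor fires on a rise in rejected connections rather than on their absolute count. Rendered with illustrative filter and threshold:

```
change(avg(last_5m),last_5m): (
  avg:redis.net.rejected{env:prod} by {redis_host,redis_port}
) > 0
```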
scripts/00_requirements.sh (new executable file, 48 lines)
@ -0,0 +1,48 @@
#!/bin/bash
set -u

source "$(dirname $0)/utils.sh"
goto_root

function check_command() {
  local cmd="$1"
  if ! command -v ${cmd} > /dev/null 2>&1; then
    echo "This requires ${cmd} command, please install it first."
    exit 1
  fi
}

function verlte() {
  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
}

function verlt() {
  [ "$1" = "$2" ] && return 1 || verlte $1 $2
}

function check_version() {
  if [[ "$1" == "terraform" ]]; then
    tmp_dir=$(mktemp -d)
    cd ${tmp_dir}
    cur_ver=$(terraform version | head -n 1 | cut -d' ' -f2)
    cur_ver=${cur_ver#"v"}
    cd - > /dev/null
    rm -fr ${tmp_dir}
    req_ver=$(grep required_version README.md | awk '{print $4}')
    req_ver=${req_ver%'"'}
  elif [[ "$1" == "terraform-docs" ]]; then
    req_ver="0.6.0"
    cur_ver=$(terraform-docs --version)
  else
    return
  fi
  if ! verlte $req_ver $cur_ver; then
    echo "This requires at least version ${req_ver} of $1, please upgrade (current version is ${cur_ver})"
    exit 2
  fi
}

for cmd in terraform terraform-docs; do
  check_command $cmd
  check_version $cmd
done
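verlte compares versions by letting `sort -V` order the pair and checking whether the first argument sorted lowest, so a check passes whenever the installed version is at least the required one. Run from the repository, a failing check looks like this (version numbers are illustrative; the message and exit code come from the script itself):

```
$ ./scripts/00_requirements.sh
This requires at least version 0.12.6 of terraform, please upgrade (current version is 0.12.3)
$ echo $?
2
```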
@ -31,11 +31,7 @@ EOF
  terraform init ${dir}
  terraform validate ${dir}
  rm -f ${dir}/tmp.tf
- # hack to work around bug https://github.com/hashicorp/terraform/issues/21434
- # TODO when fixed, remove this bloc and add "terraform fmt -recursive" to the end of this file
- for file in $(grep ' = <<E' ${dir}/* | cut -d':' -f1 | sort | uniq); do
-   sed -Ei '/<<EO(Q|F)/,/EO(Q|F)/ s/^#*/#/' ${file}
-   terraform fmt ${dir}
-   sed -Ei '/<<EO(Q|F)/,/EO(Q|F)/ s/^[[:space:]]*#//' ${file}
- done
done

+ terraform fmt -recursive
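The deleted loop was a workaround for terraform fmt corrupting heredoc bodies (hashicorp/terraform#21434): the first sed prefixes every line of each <<EOQ/<<EOF block with `#` so fmt leaves it alone, and the second strips the added indentation and the `#` back off; per the TODO, the fix lets a single `terraform fmt -recursive` replace the whole block. The masking step, demonstrated on a throwaway file:

```
$ printf 'query = <<EOQ\navg(last_5m):avg:system.cpu.user{*} > 90\nEOQ\n' > sample.tf
$ sed -E '/<<EO(Q|F)/,/EO(Q|F)/ s/^#*/#/' sample.tf
#query = <<EOQ
#avg(last_5m):avg:system.cpu.user{*} > 90
#EOQ
```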
@ -70,7 +70,7 @@ resource "datadog_monitor" "disk_space" {
  message = coalesce(var.disk_space_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.disk_space_time_aggregator}(${var.disk_space_timeframe}):
  avg:system.disk.in_use${module.filter-tags-disk.query_alert} by {host,device}
  * 100 > ${var.disk_space_threshold_critical}
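system.disk.in_use is reported by the agent as a 0-1 fraction, hence the `* 100` before comparing against a percentage threshold. Rendered with illustrative values:

```
max(last_5m):avg:system.disk.in_use{env:prod} by {host,device} * 100 > 90
```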
@ -103,21 +103,14 @@ resource "datadog_monitor" "disk_space_forecast" {
  message = coalesce(var.disk_space_forecast_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.disk_space_forecast_time_aggregator}(${var.disk_space_forecast_timeframe}):
forecast(avg:system.disk.in_use${module.filter-tags-disk.query_alert} by {host,device} * 100,
'${var.disk_space_forecast_algorithm}',
${var.disk_space_forecast_deviations},
interval='${var.disk_space_forecast_interval}',
-${var.disk_space_forecast_algorithm == "linear" ? format(
-  "history='%s',model='%s'",
-  var.disk_space_forecast_linear_history,
-  var.disk_space_forecast_linear_model,
-) : ""}
-${var.disk_space_forecast_algorithm == "seasonal" ? format(
-  "seasonality='%s'",
-  var.disk_space_forecast_seasonal_seasonality,
-) : ""}
+${var.disk_space_forecast_algorithm == "linear" ? format("history='%s',model='%s'", var.disk_space_forecast_linear_history, var.disk_space_forecast_linear_model) : ""}
+${var.disk_space_forecast_algorithm == "seasonal" ? format("seasonality='%s'", var.disk_space_forecast_seasonal_seasonality) : ""}
)
>= ${var.disk_space_forecast_threshold_critical}
EOQ