diff --git a/cloud/aws/alb/README.md b/cloud/aws/alb/README.md index 9aa2189..ba7a03f 100644 --- a/cloud/aws/alb/README.md +++ b/cloud/aws/alb/README.md @@ -44,6 +44,7 @@ Creates DataDog monitors with the following checks: | httpcode_alb_4xx_silenced | Groups to mute for ALB httpcode 4xx monitor | map | `{}` | no | | httpcode_alb_4xx_threshold_critical | loadbalancer 4xx critical threshold in percentage | string | `80` | no | | httpcode_alb_4xx_threshold_warning | loadbalancer 4xx warning threshold in percentage | string | `60` | no | +| httpcode_alb_4xx_time_aggregator | Monitor aggregator for ALB httpcode 4xx [available values: min, max or avg] | string | `min` | no | | httpcode_alb_4xx_timeframe | Monitor timeframe for ALB httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | httpcode_alb_5xx_enabled | Flag to enable ALB httpcode 5xx monitor | string | `true` | no | | httpcode_alb_5xx_extra_tags | Extra tags for ALB httpcode 5xx monitor | list | `[]` | no | @@ -51,6 +52,7 @@ Creates DataDog monitors with the following checks: | httpcode_alb_5xx_silenced | Groups to mute for ALB httpcode 5xx monitor | map | `{}` | no | | httpcode_alb_5xx_threshold_critical | loadbalancer 5xx critical threshold in percentage | string | `80` | no | | httpcode_alb_5xx_threshold_warning | loadbalancer 5xx warning threshold in percentage | string | `60` | no | +| httpcode_alb_5xx_time_aggregator | Monitor aggregator for ALB httpcode 5xx [available values: min, max or avg] | string | `min` | no | | httpcode_alb_5xx_timeframe | Monitor timeframe for ALB httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | httpcode_target_4xx_enabled | Flag to enable ALB target httpcode 4xx monitor | string | `true` | no | | httpcode_target_4xx_extra_tags | Extra tags for ALB target httpcode 4xx monitor | list | `[]` | no | @@ -58,6 +60,7 @@ Creates DataDog monitors with the following checks: | httpcode_target_4xx_silenced | Groups to mute for ALB target httpcode 4xx monitor | map | `{}` | no | | httpcode_target_4xx_threshold_critical | target 4xx critical threshold in percentage | string | `80` | no | | httpcode_target_4xx_threshold_warning | target 4xx warning threshold in percentage | string | `60` | no | +| httpcode_target_4xx_time_aggregator | Monitor aggregator for ALB target httpcode 4xx [available values: min, max or avg] | string | `min` | no | | httpcode_target_4xx_timeframe | Monitor timeframe for ALB target httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | httpcode_target_5xx_enabled | Flag to enable ALB target httpcode 5xx monitor | string | `true` | no | | httpcode_target_5xx_extra_tags | Extra tags for ALB target httpcode 5xx monitor | list | `[]` | no | @@ -65,6 +68,7 @@ Creates DataDog monitors with the following checks: | httpcode_target_5xx_silenced | Groups to mute for ALB target httpcode 5xx monitor | map | `{}` | no | | httpcode_target_5xx_threshold_critical | target 5xx critical threshold in percentage | string | `80` | no | | httpcode_target_5xx_threshold_warning | target 5xx warning threshold in percentage | string | `60` | no | +| httpcode_target_5xx_time_aggregator | Monitor aggregator for ALB target httpcode 5xx [available values: min, max or avg] | string | `min` | no | | httpcode_target_5xx_timeframe | Monitor timeframe for ALB target httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | latency_enabled | Flag to enable ALB latency monitor | string | `true` | no | | latency_extra_tags | Extra tags for ALB latency monitor | list | `[]` | no | diff --git a/cloud/aws/alb/inputs.tf b/cloud/aws/alb/inputs.tf index 721274a..ae6d74b 100644 --- a/cloud/aws/alb/inputs.tf +++ b/cloud/aws/alb/inputs.tf @@ -137,6 +137,12 @@ variable "httpcode_alb_4xx_message" { default = "" } +variable "httpcode_alb_4xx_time_aggregator" { + description = "Monitor aggregator for ALB httpcode 4xx [available values: min, max or avg]" + type = "string" + default = "min" +} + variable "httpcode_alb_4xx_timeframe" { description = "Monitor timeframe for ALB httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" type = "string" @@ -177,6 +183,12 @@ variable "httpcode_target_4xx_message" { default = "" } +variable "httpcode_target_4xx_time_aggregator" { + description = "Monitor aggregator for ALB target httpcode 4xx [available values: min, max or avg]" + type = "string" + default = "min" +} + variable "httpcode_target_4xx_timeframe" { description = "Monitor timeframe for ALB target httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" type = "string" @@ -217,6 +229,12 @@ variable "httpcode_alb_5xx_message" { default = "" } +variable "httpcode_alb_5xx_time_aggregator" { + description = "Monitor aggregator for ALB httpcode 5xx [available values: min, max or avg]" + type = "string" + default = "min" +} + variable "httpcode_alb_5xx_timeframe" { description = "Monitor timeframe for ALB httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" type = "string" @@ -257,6 +275,12 @@ variable "httpcode_target_5xx_message" { default = "" } +variable "httpcode_target_5xx_time_aggregator" { + description = "Monitor aggregator for ALB target httpcode 5xx [available values: min, max or avg]" + type = "string" + default = "min" +} + variable "httpcode_target_5xx_timeframe" { description = "Monitor timeframe for ALB target httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" type = "string" diff --git a/cloud/aws/alb/monitors-alb.tf b/cloud/aws/alb/monitors-alb.tf index 43f6103..fcbf956 100644 --- a/cloud/aws/alb/monitors-alb.tf +++ b/cloud/aws/alb/monitors-alb.tf @@ -69,9 +69,9 @@ resource "datadog_monitor" "ALB_httpcode_5xx" { message = "${coalesce(var.httpcode_alb_5xx_message, var.message)}" query = < ${var.httpcode_alb_5xx_threshold_critical} EOF @@ -101,9 +101,9 @@ resource "datadog_monitor" "ALB_httpcode_4xx" { message = "${coalesce(var.httpcode_alb_4xx_message, var.message)}" query = < ${var.httpcode_alb_4xx_threshold_critical} EOF @@ -133,9 +133,9 @@ resource "datadog_monitor" "ALB_httpcode_target_5xx" { message = "${coalesce(var.httpcode_target_5xx_message, var.message)}" query = < ${var.httpcode_target_5xx_threshold_critical} EOF @@ -165,9 +165,9 @@ resource "datadog_monitor" "ALB_httpcode_target_4xx" { message = "${coalesce(var.httpcode_target_4xx_message, var.message)}" query = < ${var.httpcode_target_4xx_threshold_critical} EOF diff --git a/cloud/aws/apigateway/README.md b/cloud/aws/apigateway/README.md index 3b2cd51..34f1761 100644 --- a/cloud/aws/apigateway/README.md +++ b/cloud/aws/apigateway/README.md @@ -34,6 +34,7 @@ Creates DataDog monitors with the following checks: | http_4xx_requests_silenced | Groups to mute for API Gateway HTTP 4xx requests monitor | map | `{}` | no | | http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no | | http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no | +| http_4xx_requests_time_aggregator | Monitor aggregator for API HTTP 4xx requests [available values: min, max or avg] | string | `min` | no | | http_4xx_requests_timeframe | Monitor timeframe for API HTTP 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | http_5xx_requests_enabled | Flag to enable API Gateway HTTP 5xx requests monitor | string | `true` | no | | http_5xx_requests_extra_tags | Extra tags for API Gateway HTTP 5xx requests monitor | list | `[]` | no | @@ -41,6 +42,7 @@ Creates DataDog monitors with the following checks: | http_5xx_requests_silenced | Groups to mute for API Gateway HTTP 5xx requests monitor | map | `{}` | no | | http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no | | http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no | +| http_5xx_requests_time_aggregator | Monitor aggregator for API HTTP 5xx requests [available values: min, max or avg] | string | `min` | no | | http_5xx_requests_timeframe | Monitor timeframe for API HTTP 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | latency_enabled | Flag to enable API Gateway latency monitor | string | `true` | no | | latency_extra_tags | Extra tags for API Gateway latency monitor | list | `[]` | no | diff --git a/cloud/aws/apigateway/inputs.tf b/cloud/aws/apigateway/inputs.tf index 810080d..43ff6fa 100644 --- a/cloud/aws/apigateway/inputs.tf +++ b/cloud/aws/apigateway/inputs.tf @@ -100,6 +100,12 @@ variable "http_5xx_requests_message" { default = "" } +variable "http_5xx_requests_time_aggregator" { + description = "Monitor aggregator for API HTTP 5xx requests [available values: min, max or avg]" + type = "string" + default = "min" +} + variable "http_5xx_requests_timeframe" { description = "Monitor timeframe for API HTTP 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" type = "string" @@ -144,6 +150,12 @@ variable "http_4xx_requests_message" { default = "" } +variable "http_4xx_requests_time_aggregator" { + description = "Monitor aggregator for API HTTP 4xx requests [available values: min, max or avg]" + type = "string" + default = "min" +} + variable "http_4xx_requests_timeframe" { description = "Monitor timeframe for API HTTP 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" type = "string" diff --git a/cloud/aws/apigateway/monitors-api.tf b/cloud/aws/apigateway/monitors-api.tf index 859892b..499e111 100644 --- a/cloud/aws/apigateway/monitors-api.tf +++ b/cloud/aws/apigateway/monitors-api.tf @@ -38,9 +38,9 @@ resource "datadog_monitor" "API_http_5xx_errors_count" { message = "${coalesce(var.http_5xx_requests_message, var.message)}" query = < ${var.http_5xx_requests_threshold_critical} EOF @@ -71,9 +71,9 @@ resource "datadog_monitor" "API_http_4xx_errors_count" { message = "${coalesce(var.http_4xx_requests_message, var.message)}" query = < ${var.http_4xx_requests_threshold_critical} EOF diff --git a/cloud/aws/elasticache/memcached/README.md b/cloud/aws/elasticache/memcached/README.md index 982eb38..4db761e 100644 --- a/cloud/aws/elasticache/memcached/README.md +++ b/cloud/aws/elasticache/memcached/README.md @@ -41,6 +41,7 @@ Creates DataDog monitors with the following checks: | get_hits_silenced | Groups to mute for Elasticache memcached get hits monitor | map | `{}` | no | | get_hits_threshold_critical | Elasticache memcached get hits critical threshold in percentage | string | `60` | no | | get_hits_threshold_warning | Elasticache memcached get hits warning threshold in percentage | string | `80` | no | +| get_hits_time_aggregator | Monitor aggregator for Elasticache memcached get hits [available values: min, max or avg] | string | `max` | no | | get_hits_timeframe | Monitor timeframe for Elasticache memcached get hits [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_15m` | no | | message | Message sent when an alert is triggered | string | - | yes | | new_host_delay | Delay in seconds before monitor new resource | string | `300` | no | diff --git a/cloud/aws/elasticache/memcached/inputs.tf b/cloud/aws/elasticache/memcached/inputs.tf index a07c61a..e629181 100644 --- a/cloud/aws/elasticache/memcached/inputs.tf +++ b/cloud/aws/elasticache/memcached/inputs.tf @@ -54,6 +54,12 @@ variable "get_hits_message" { default = "" } +variable "get_hits_time_aggregator" { + description = "Monitor aggregator for Elasticache memcached get hits [available values: min, max or avg]" + type = "string" + default = "max" +} + variable "get_hits_timeframe" { description = "Monitor timeframe for Elasticache memcached get hits [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" type = "string" diff --git a/cloud/aws/elasticache/memcached/monitors-memcached.tf b/cloud/aws/elasticache/memcached/monitors-memcached.tf index a375726..aa8e65e 100644 --- a/cloud/aws/elasticache/memcached/monitors-memcached.tf +++ b/cloud/aws/elasticache/memcached/monitors-memcached.tf @@ -6,10 +6,10 @@ resource "datadog_monitor" "memcached_get_hits" { type = "metric alert" query = < ${var.failed_requests_threshold_critical} + min(${var.failed_requests_timeframe}): ( + default(avg:azure.apimanagement_service.failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / + default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1) + ) * 100 > ${var.failed_requests_threshold_critical} EOF thresholds { @@ -67,10 +67,10 @@ resource "datadog_monitor" "apimgt_other_requests" { message = "${coalesce(var.other_requests_message, var.message)}" query = < ${var.other_requests_threshold_critical} + min(${var.other_requests_timeframe}): ( + default(avg:azure.apimanagement_service.other_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / + default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1) + ) * 100 > ${var.other_requests_threshold_critical} EOF thresholds { @@ -100,10 +100,10 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" { message = "${coalesce(var.unauthorized_requests_message, var.message)}" query = < ${var.unauthorized_requests_threshold_critical} + min(${var.unauthorized_requests_timeframe}): ( + default(avg:azure.apimanagement_service.unauthorized_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / + default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1) + ) * 100 > ${var.unauthorized_requests_threshold_critical} EOF thresholds { @@ -133,10 +133,10 @@ resource "datadog_monitor" "apimgt_successful_requests" { message = "${coalesce(var.successful_requests_message, var.message)}" query = < ${var.http_5xx_requests_threshold_critical} EOF @@ -103,9 +103,9 @@ resource "datadog_monitor" "appservices_http_4xx_errors_count" { message = "${coalesce(var.http_4xx_requests_message, var.message)}" query = < ${var.http_4xx_requests_threshold_critical} EOF @@ -136,10 +136,10 @@ resource "datadog_monitor" "appservices_http_success_status_rate" { message = "${coalesce(var.http_successful_requests_message, var.message)}" query = < ${var.cosmos_db_4xx_request_rate_threshold_critical} - EOF + ${var.cosmos_db_4xx_request_time_aggregator}(${var.cosmos_db_4xx_request_timeframe}): ( ( + default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "400")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "401")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "403")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "404")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "408")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "409")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "412")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "413")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "449")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "400")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "401")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "403")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "404")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "408")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "409")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "412")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "413")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "449")} by {resource_group,region,name,collectionname}.as_rate(), 0) ) / ( + default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 1) + + default(sum:azure.documentdb_databaseaccounts.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 1) ) + ) * 100 > ${var.cosmos_db_4xx_request_rate_threshold_critical} + EOF type = "metric alert" @@ -99,19 +95,15 @@ resource "datadog_monitor" "cosmos_db_5xx_requests" { message = "${coalesce(var.cosmos_db_5xx_requests_message, var.message)}" query = < ${var.cosmos_db_5xx_request_rate_threshold_critical} - EOF + ${var.cosmos_db_5xx_request_time_aggregator}(${var.cosmos_db_5xx_request_timeframe}): ( ( + default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "500")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "503")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "500")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "503")} by {resource_group,region,name,collectionname}.as_rate(), 0) ) / ( + default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 1) + + default(sum:azure.documentdb_databaseaccounts.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 1) ) + ) * 100 > ${var.cosmos_db_5xx_request_rate_threshold_critical} + EOF type = "metric alert" @@ -143,17 +135,13 @@ resource "datadog_monitor" "cosmos_db_scaling" { # List of available status codes : https://docs.microsoft.com/en-us/rest/api/cosmos-db/http-status-codes-for-cosmosdb query = < ${var.cosmos_db_scaling_error_rate_threshold_critical} - EOF + ${var.cosmos_db_scaling_time_aggregator}(${var.cosmos_db_scaling_timeframe}): ( ( + default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,collectionname}.as_rate(), 0) ) / ( + default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 1) + + default(sum:azure.documentdb_databaseaccounts.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 1) ) + ) * 100 > ${var.cosmos_db_scaling_error_rate_threshold_critical} + EOF type = "metric alert" diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index d2118fe..876a645 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -32,12 +32,10 @@ resource "datadog_monitor" "eventhub_failed_requests" { message = "${coalesce(var.failed_requests_rate_message, var.message)}" query = < ${var.failed_requests_rate_thresold_critical} + min(${var.failed_requests_rate_timeframe}): ( + default(avg:azure.eventhub_namespaces.failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / + default(avg:azure.eventhub_namespaces.incoming_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1) + ) * 100 > ${var.failed_requests_rate_thresold_critical} EOF type = "metric alert" @@ -68,17 +66,12 @@ resource "datadog_monitor" "eventhub_errors" { message = "${coalesce(var.errors_rate_message, var.message)}" query = < ${var.errors_rate_thresold_critical} + min(${var.errors_rate_timeframe}): ( ( + default(avg:azure.eventhub_namespaces.internal_server_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.eventhub_namespaces.server_busy_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.eventhub_namespaces.other_errors${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) ) / + default(avg:azure.eventhub_namespaces.incoming_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1) + ) * 100 > ${var.errors_rate_thresold_critical} EOF type = "metric alert" diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 27cd822..9818f67 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -4,11 +4,11 @@ resource "datadog_monitor" "too_many_jobs_failed" { message = "${coalesce(var.failed_jobs_rate_message, var.message)}" query = < ${var.failed_jobs_rate_threshold_critical} + min(${var.failed_jobs_rate_timeframe}):( + default(avg:azure.devices_iothubs.jobs.failed{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.jobs.failed{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.jobs.completed{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 1) ) + ) * 100 > ${var.failed_jobs_rate_threshold_critical} EOF type = "metric alert" @@ -39,11 +39,11 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { message = "${coalesce(var.failed_listjobs_rate_message, var.message)}" query = < ${var.failed_listjobs_rate_threshold_critical} + min(${var.failed_listjobs_rate_timeframe}):( + default(avg:azure.devices_iothubs.jobs.list_jobs.failure{${var.filter_tags}} by {resource_group,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.jobs.list_jobs.success{${var.filter_tags}} by {resource_group,name}.as_rate(), 1) + + default(avg:azure.devices_iothubs.jobs.list_jobs.failure{${var.filter_tags}} by {resource_group,name}.as_rate(), 0) ) + ) * 100 > ${var.failed_listjobs_rate_threshold_critical} EOF type = "metric alert" @@ -74,10 +74,10 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { message = "${coalesce(var.failed_queryjobs_rate_message, var.message)}" query = < ${var.failed_queryjobs_rate_threshold_critical} EOF @@ -165,10 +165,10 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { message = "${coalesce(var.failed_c2d_methods_rate_message, var.message)}" query = < ${var.failed_c2d_methods_rate_threshold_critical} EOF @@ -200,10 +200,10 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { message = "${coalesce(var.failed_c2d_twin_read_rate_message, var.message)}" query = < ${var.failed_c2d_twin_read_rate_threshold_critical} EOF @@ -235,10 +235,10 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { message = "${coalesce(var.failed_c2d_twin_update_rate_message, var.message)}" query = < ${var.failed_c2d_twin_update_rate_threshold_critical} EOF @@ -270,10 +270,10 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { message = "${coalesce(var.failed_d2c_twin_read_rate_message, var.message)}" query = < ${var.failed_d2c_twin_read_rate_threshold_critical} EOF @@ -305,10 +305,10 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { message = "${coalesce(var.failed_d2c_twin_update_rate_message, var.message)}" query = < ${var.failed_d2c_twin_update_rate_threshold_critical} EOF @@ -340,14 +340,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { message = "${coalesce(var.dropped_d2c_telemetry_egress_message, var.message)}" query = < ${var.dropped_d2c_telemetry_egress_rate_threshold_critical} + min(${var.dropped_d2c_telemetry_egress_timeframe}): ( + default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 1) ) + ) * 100 > ${var.dropped_d2c_telemetry_egress_rate_threshold_critical} EOF type = "metric alert" @@ -378,14 +377,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { message = "${coalesce(var.orphaned_d2c_telemetry_egress_message, var.message)}" query = < ${var.orphaned_d2c_telemetry_egress_rate_threshold_critical} + min(${var.orphaned_d2c_telemetry_egress_timeframe}): ( + default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 1) ) + ) * 100 > ${var.orphaned_d2c_telemetry_egress_rate_threshold_critical} EOF type = "metric alert" @@ -416,14 +414,13 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { message = "${coalesce(var.invalid_d2c_telemetry_egress_message, var.message)}" query = < ${var.invalid_d2c_telemetry_egress_rate_threshold_critical} + min(${var.invalid_d2c_telemetry_egress_timeframe}): ( + default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / ( + default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + + default(avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 1) ) + ) * 100 > ${var.invalid_d2c_telemetry_egress_rate_threshold_critical} EOF type = "metric alert" @@ -456,7 +453,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { query = < 0 EOF diff --git a/cloud/azure/keyvault/README.md b/cloud/azure/keyvault/README.md index 28b5d8c..46e1757 100644 --- a/cloud/azure/keyvault/README.md +++ b/cloud/azure/keyvault/README.md @@ -38,7 +38,7 @@ Creates DataDog monitors with the following checks: | api_result_silenced | Groups to mute for Key Vault API result monitor | map | `{}` | no | | api_result_threshold_critical | Critical threshold for Key Vault API result rate | string | `10` | no | | api_result_threshold_warning | Warning threshold for Key Vault API result rate | string | `30` | no | -| api_result_time_aggregator | Monitor aggregator for Key Vault API result [available values: min, max or avg] | string | `sum` | no | +| api_result_time_aggregator | Monitor aggregator for Key Vault API result [available values: min, max or avg] | string | `max` | no | | api_result_timeframe | Monitor timeframe for Key Vault API result [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no | | environment | Architecture environment | string | - | yes | | evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no | diff --git a/cloud/azure/keyvault/inputs.tf b/cloud/azure/keyvault/inputs.tf index fa03318..2ef5361 100644 --- a/cloud/azure/keyvault/inputs.tf +++ b/cloud/azure/keyvault/inputs.tf @@ -84,7 +84,7 @@ variable "api_result_message" { variable "api_result_time_aggregator" { description = "Monitor aggregator for Key Vault API result [available values: min, max or avg]" type = "string" - default = "sum" + default = "max" } variable "api_result_timeframe" { diff --git a/cloud/azure/keyvault/monitors-keyvault.tf b/cloud/azure/keyvault/monitors-keyvault.tf index 1847517..9f87421 100644 --- a/cloud/azure/keyvault/monitors-keyvault.tf +++ b/cloud/azure/keyvault/monitors-keyvault.tf @@ -35,8 +35,8 @@ resource "datadog_monitor" "keyvault_api_result" { query = < ${var.user_errors_threshold_critical} + min(${var.user_errors_timeframe}): ( + default(avg:azure.servicebus_namespaces.user_errors.preview${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / + default(avg:azure.servicebus_namespaces.incoming_requests_preview${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1) + ) * 100 > ${var.user_errors_threshold_critical} EOF type = "metric alert" @@ -99,11 +98,10 @@ resource "datadog_monitor" "service_bus_server_errors" { message = "${coalesce(var.server_errors_message, var.message)}" query = < ${var.server_errors_threshold_critical} + min(${var.server_errors_timeframe}): ( + default(avg:azure.servicebus_namespaces.server_errors.preview${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / + default(avg:azure.servicebus_namespaces.incoming_requests_preview${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1) + ) * 100 > ${var.server_errors_threshold_critical} EOF type = "metric alert" diff --git a/cloud/azure/stream-analytics/monitors-stream-analytics.tf b/cloud/azure/stream-analytics/monitors-stream-analytics.tf index 4ea2efa..b13ba29 100644 --- a/cloud/azure/stream-analytics/monitors-stream-analytics.tf +++ b/cloud/azure/stream-analytics/monitors-stream-analytics.tf @@ -65,9 +65,9 @@ resource "datadog_monitor" "failed_function_requests" { message = "${coalesce(var.failed_function_requests_message, var.message)}" query = < ${var.failed_function_requests_threshold_critical} EOF diff --git a/cloud/gcp/lb/README.md b/cloud/gcp/lb/README.md index 35b5873..f8ceae0 100644 --- a/cloud/gcp/lb/README.md +++ b/cloud/gcp/lb/README.md @@ -50,7 +50,7 @@ Creates DataDog monitors with the following checks: | error_rate_4xx_silenced | Groups to mute for GCP LB 4XX Errors monitor | map | `{}` | no | | error_rate_4xx_threshold_critical | Rate error in percentage (critical threshold) | string | `60` | no | | error_rate_4xx_threshold_warning | Rate error in percentage (warning threshold) | string | `50` | no | -| error_rate_4xx_time_aggregator | Timeframe for the GCP LB 4XX Errors monitor | string | `sum` | no | +| error_rate_4xx_time_aggregator | Timeframe for the GCP LB 4XX Errors monitor | string | `min` | no | | error_rate_4xx_timeframe | Timeframe for the GCP LB 4XX Errors monitor | string | `last_5m` | no | | error_rate_5xx_artificial_request | Divisor Delta for the GCP LB 5XX Errors monitor | string | `5` | no | | error_rate_5xx_enabled | Flag to enable GCP LB 5XX Errors monitor | string | `true` | no | @@ -59,7 +59,7 @@ Creates DataDog monitors with the following checks: | error_rate_5xx_silenced | Groups to mute for GCP LB 5XX Errors monitor | map | `{}` | no | | error_rate_5xx_threshold_critical | Rate error in percentage (critical threshold) | string | `40` | no | | error_rate_5xx_threshold_warning | Rate error in percentage (warning threshold) | string | `30` | no | -| error_rate_5xx_time_aggregator | Timeframe for the GCP LB 5XX Errors monitor | string | `sum` | no | +| error_rate_5xx_time_aggregator | Timeframe for the GCP LB 5XX Errors monitor | string | `min` | no | | error_rate_5xx_timeframe | Timeframe for the GCP LB 5XX Errors monitor | string | `last_5m` | no | | evaluation_delay | Delay in seconds for the metric evaluation | string | `900` | no | | filter_tags | Tags used for filtering | string | `*` | no | diff --git a/cloud/gcp/lb/inputs.tf b/cloud/gcp/lb/inputs.tf index 1080372..e81beed 100644 --- a/cloud/gcp/lb/inputs.tf +++ b/cloud/gcp/lb/inputs.tf @@ -37,7 +37,7 @@ variable "error_rate_4xx_message" { variable "error_rate_4xx_time_aggregator" { description = "Timeframe for the GCP LB 4XX Errors monitor" type = "string" - default = "sum" + default = "min" } variable "error_rate_4xx_timeframe" { @@ -94,7 +94,7 @@ variable "error_rate_5xx_message" { variable "error_rate_5xx_time_aggregator" { description = "Timeframe for the GCP LB 5XX Errors monitor" type = "string" - default = "sum" + default = "min" } variable "error_rate_5xx_timeframe" { diff --git a/cloud/gcp/lb/monitors-lb.tf b/cloud/gcp/lb/monitors-lb.tf index f67b47a..8e2b2d3 100644 --- a/cloud/gcp/lb/monitors-lb.tf +++ b/cloud/gcp/lb/monitors-lb.tf @@ -10,9 +10,8 @@ resource "datadog_monitor" "error_rate_4xx" { query = < ${var.error_rate_4xx_threshold_critical} EOF @@ -49,9 +48,8 @@ resource "datadog_monitor" "error_rate_5xx" { query = < ${var.error_rate_5xx_threshold_critical} EOF