diff --git a/caas/kubernetes/ingress/inputs.tf b/caas/kubernetes/ingress/inputs.tf index 35c39b6..5ca28df 100644 --- a/caas/kubernetes/ingress/inputs.tf +++ b/caas/kubernetes/ingress/inputs.tf @@ -53,6 +53,12 @@ variable "ingress_5xx_message" { default = "" } +variable "ingress_5xx_time_aggregator" { + description = "Monitor aggregator for Ingress 5xx errors [available values: min, max or avg]" + type = "string" + default = "min" +} + variable "ingress_5xx_timeframe" { description = "Monitor timeframe for Ingress 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" type = "string" @@ -94,6 +100,12 @@ variable "ingress_4xx_message" { default = "" } +variable "ingress_4xx_time_aggregator" { + description = "Monitor aggregator for Ingress 4xx errors [available values: min, max or avg]" + type = "string" + default = "min" +} + variable "ingress_4xx_timeframe" { description = "Monitor timeframe for Ingress 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" type = "string" diff --git a/caas/kubernetes/ingress/monitors-ingress.tf b/caas/kubernetes/ingress/monitors-ingress.tf index ab44196..5be3c67 100644 --- a/caas/kubernetes/ingress/monitors-ingress.tf +++ b/caas/kubernetes/ingress/monitors-ingress.tf @@ -4,12 +4,10 @@ resource "datadog_monitor" "nginx_ingress_too_many_5xx" { message = "${coalesce(var.ingress_5xx_message, var.message)}" query = < ${var.ingress_5xx_threshold_critical} + ${var.ingress_5xx_time_aggregator}(${var.ingress_5xx_timeframe}): ( + default(avg:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-5xx.query_alert} by {upstream,ingress_class}.as_rate(), 0) / ( + default(avg:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class}.as_rate() + ${var.artificial_requests_count}, 1)) + * 100 > ${var.ingress_5xx_threshold_critical} EOF type = "metric alert" @@ -40,12 +38,10 @@ resource
"datadog_monitor" "nginx_ingress_too_many_4xx" { message = "${coalesce(var.ingress_4xx_message, var.message)}" query = < ${var.ingress_4xx_threshold_critical} + ${var.ingress_4xx_time_aggregator}(${var.ingress_4xx_timeframe}): ( + default(avg:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-4xx.query_alert} by {upstream,ingress_class}.as_rate(), 0) / ( + default(avg:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class}.as_rate() + ${var.artificial_requests_count}, 1)) + * 100 > ${var.ingress_4xx_threshold_critical} EOF type = "metric alert" diff --git a/cloud/aws/alb/monitors-alb.tf b/cloud/aws/alb/monitors-alb.tf index fcbf956..72b7099 100644 --- a/cloud/aws/alb/monitors-alb.tf +++ b/cloud/aws/alb/monitors-alb.tf @@ -71,7 +71,7 @@ resource "datadog_monitor" "ALB_httpcode_5xx" { query = < ${var.httpcode_alb_5xx_threshold_critical} EOF @@ -103,7 +103,7 @@ resource "datadog_monitor" "ALB_httpcode_4xx" { query = < ${var.httpcode_alb_4xx_threshold_critical} EOF @@ -135,7 +135,7 @@ resource "datadog_monitor" "ALB_httpcode_target_5xx" { query = < ${var.httpcode_target_5xx_threshold_critical} EOF @@ -167,7 +167,7 @@ resource "datadog_monitor" "ALB_httpcode_target_4xx" { query = < ${var.httpcode_target_4xx_threshold_critical} EOF diff --git a/cloud/aws/apigateway/monitors-api.tf b/cloud/aws/apigateway/monitors-api.tf index 499e111..c3c5218 100644 --- a/cloud/aws/apigateway/monitors-api.tf +++ b/cloud/aws/apigateway/monitors-api.tf @@ -40,7 +40,7 @@ resource "datadog_monitor" "API_http_5xx_errors_count" { query = < ${var.http_5xx_requests_threshold_critical} EOF @@ -73,7 +73,7 @@ resource "datadog_monitor" "API_http_4xx_errors_count" { query = < ${var.http_4xx_requests_threshold_critical} EOF diff --git a/cloud/aws/elasticache/memcached/monitors-memcached.tf b/cloud/aws/elasticache/memcached/monitors-memcached.tf index aa8e65e..6ff12c2 100644 --- a/cloud/aws/elasticache/memcached/monitors-memcached.tf +++
b/cloud/aws/elasticache/memcached/monitors-memcached.tf @@ -8,7 +8,7 @@ resource "datadog_monitor" "memcached_get_hits" { query = < ${var.elb_4xx_threshold_critical} EOF @@ -74,8 +74,8 @@ resource "datadog_monitor" "ELB_too_much_5xx" { query = < ${var.elb_5xx_threshold_critical} EOF @@ -108,8 +108,8 @@ resource "datadog_monitor" "ELB_too_much_4xx_backend" { query = < ${var.elb_backend_4xx_threshold_critical} EOF @@ -142,8 +142,8 @@ resource "datadog_monitor" "ELB_too_much_5xx_backend" { query = < ${var.elb_backend_5xx_threshold_critical} EOF diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index 17cbe8f..aa4ed12 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -137,7 +137,7 @@ resource "datadog_monitor" "appservices_http_success_status_rate" { query = < ${var.cosmos_db_4xx_request_rate_threshold_critical} EOF @@ -100,8 +100,8 @@ resource "datadog_monitor" "cosmos_db_5xx_requests" { default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "503")} by {resource_group,region,name,collectionname}.as_rate(), 0) + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "500")} by {resource_group,region,name,collectionname}.as_rate(), 0) + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "503")} by {resource_group,region,name,collectionname}.as_rate(), 0) ) / ( - default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 1) + - default(sum:azure.documentdb_databaseaccounts.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 1) ) + default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by 
{resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.documentdb_databaseaccounts.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 0) ) ) * 100 > ${var.cosmos_db_5xx_request_rate_threshold_critical} EOF @@ -138,8 +138,8 @@ resource "datadog_monitor" "cosmos_db_scaling" { ${var.cosmos_db_scaling_time_aggregator}(${var.cosmos_db_scaling_timeframe}): ( ( default(sum:azure.cosmosdb.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,collectionname}.as_rate(), 0) + default(sum:azure.documentdb_databaseaccounts.total_requests${format(module.filter-tags-statuscode.query_alert, "429")} by {resource_group,region,name,collectionname}.as_rate(), 0) ) / ( - default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 1) + - default(sum:azure.documentdb_databaseaccounts.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 1) ) + default(sum:azure.cosmosdb.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 0) + + default(sum:azure.documentdb_databaseaccounts.total_requests${module.filter-tags.query_alert} by {resource_group,region,name,collectionname}.as_rate(), 0) ) ) * 100 > ${var.cosmos_db_scaling_error_rate_threshold_critical} EOF diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index ff6c1dd..9c86bb6 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -7,7 +7,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { ${var.failed_jobs_rate_time_aggregator}(${var.failed_jobs_rate_timeframe}):( default(avg:azure.devices_iothubs.jobs.failed{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / ( default(avg:azure.devices_iothubs.jobs.failed{${var.filter_tags}} 
by {resource_group,region,name}.as_rate(), 0) + - default(avg:azure.devices_iothubs.jobs.completed{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 1) ) + default(avg:azure.devices_iothubs.jobs.completed{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) ) ) * 100 > ${var.failed_jobs_rate_threshold_critical} EOF @@ -41,7 +41,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { query = < ${var.failed_listjobs_rate_threshold_critical} EOF @@ -76,7 +76,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { query = < ${var.failed_queryjobs_rate_threshold_critical} EOF @@ -168,7 +168,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { ${var.failed_c2d_methods_rate_time_aggregator}(${var.failed_c2d_methods_rate_timeframe}):( default(avg:azure.devices_iothubs.c2d.methods.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / ( default(avg:azure.devices_iothubs.c2d.methods.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + - default(avg:azure.devices_iothubs.c2d.methods.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 1) ) + default(avg:azure.devices_iothubs.c2d.methods.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) ) ) * 100 > ${var.failed_c2d_methods_rate_threshold_critical} EOF @@ -203,7 +203,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { ${var.failed_c2d_twin_read_rate_time_aggregator}(${var.failed_c2d_twin_read_rate_timeframe}):( default(avg:azure.devices_iothubs.c2d.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / ( default(avg:azure.devices_iothubs.c2d.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + - default(avg:azure.devices_iothubs.c2d.twin.read.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 1) ) + default(avg:azure.devices_iothubs.c2d.twin.read.success{${var.filter_tags}} by 
{resource_group,region,name}.as_rate(), 0) ) ) * 100 > ${var.failed_c2d_twin_read_rate_threshold_critical} EOF @@ -238,7 +238,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { ${var.failed_c2d_twin_update_rate_time_aggregator}(${var.failed_c2d_twin_update_rate_timeframe}):( default(avg:azure.devices_iothubs.c2d.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / ( default(avg:azure.devices_iothubs.c2d.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + - default(avg:azure.devices_iothubs.c2d.twin.update.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 1) ) + default(avg:azure.devices_iothubs.c2d.twin.update.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) ) ) * 100 > ${var.failed_c2d_twin_update_rate_threshold_critical} EOF @@ -273,7 +273,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { ${var.failed_d2c_twin_read_rate_time_aggregator}(${var.failed_d2c_twin_read_rate_timeframe}):( default(avg:azure.devices_iothubs.d2c.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / ( default(avg:azure.devices_iothubs.d2c.twin.read.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + - default(avg:azure.devices_iothubs.d2c.twin.read.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 1) ) + default(avg:azure.devices_iothubs.d2c.twin.read.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) ) ) * 100 > ${var.failed_d2c_twin_read_rate_threshold_critical} EOF @@ -308,7 +308,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { ${var.failed_d2c_twin_update_rate_time_aggregator}(${var.failed_d2c_twin_update_rate_timeframe}):( default(avg:azure.devices_iothubs.d2c.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) / ( 
default(avg:azure.devices_iothubs.d2c.twin.update.failure{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + - default(avg:azure.devices_iothubs.d2c.twin.update.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 1) ) + default(avg:azure.devices_iothubs.d2c.twin.update.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) ) ) * 100 > ${var.failed_d2c_twin_update_rate_threshold_critical} EOF @@ -345,7 +345,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + - default(avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 1) ) + default(avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) ) ) * 100 > ${var.dropped_d2c_telemetry_egress_rate_threshold_critical} EOF @@ -382,7 +382,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + - default(avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 1) ) + default(avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by 
{resource_group,region,name}.as_rate(), 0) ) ) * 100 > ${var.orphaned_d2c_telemetry_egress_rate_threshold_critical} EOF @@ -419,7 +419,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { default(avg:azure.devices_iothubs.d2c.telemetry.egress.dropped{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + default(avg:azure.devices_iothubs.d2c.telemetry.egress.orphaned{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + default(avg:azure.devices_iothubs.d2c.telemetry.egress.invalid{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) + - default(avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 1) ) + default(avg:azure.devices_iothubs.d2c.telemetry.egress.success{${var.filter_tags}} by {resource_group,region,name}.as_rate(), 0) ) ) * 100 > ${var.invalid_d2c_telemetry_egress_rate_threshold_critical} EOF diff --git a/cloud/gcp/lb/monitors-lb.tf b/cloud/gcp/lb/monitors-lb.tf index 8e2b2d3..ebeb84e 100644 --- a/cloud/gcp/lb/monitors-lb.tf +++ b/cloud/gcp/lb/monitors-lb.tf @@ -11,8 +11,8 @@ resource "datadog_monitor" "error_rate_4xx" { query = < ${var.error_rate_4xx_threshold_critical} + default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags}} by {forwarding_rule_name}.as_rate() + ${var.error_rate_4xx_artificial_request}, 1)) + * 100 > ${var.error_rate_4xx_threshold_critical} EOF thresholds { @@ -49,8 +49,8 @@ resource "datadog_monitor" "error_rate_5xx" { query = < ${var.error_rate_5xx_threshold_critical} + default(sum:gcp.loadbalancing.https.request_count{${var.filter_tags}} by {forwarding_rule_name}.as_rate() + ${var.error_rate_5xx_artificial_request}, 1)) + * 100 > ${var.error_rate_5xx_threshold_critical} EOF thresholds {