From 5737fe5c2b7ff84a669885ddb8cdb60ac8c485d4 Mon Sep 17 00:00:00 2001 From: Alexandre Gaillet Date: Fri, 16 Mar 2018 16:23:35 +0100 Subject: [PATCH 1/2] MON-135 - Updated requests queries for Event Hub monitors --- cloud/azure/eventhub/monitors-eventhub.tf | 35 ++++++++++++----------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/cloud/azure/eventhub/monitors-eventhub.tf b/cloud/azure/eventhub/monitors-eventhub.tf index c96d967..4f9f4ed 100644 --- a/cloud/azure/eventhub/monitors-eventhub.tf +++ b/cloud/azure/eventhub/monitors-eventhub.tf @@ -38,11 +38,11 @@ resource "datadog_monitor" "eventhub_failed_requests" { query = < ${var.failed_requests_rate_thresold_critical} + default( + avg:azure.eventhub_namespaces.failed_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() / + avg:azure.eventhub_namespaces.incoming_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count(), + 0) * 100 + ) > ${var.failed_requests_rate_thresold_critical} EOF type = "metric alert" @@ -58,7 +58,7 @@ resource "datadog_monitor" "eventhub_failed_requests" { evaluation_delay = "${var.delay}" renotify_interval = 0 notify_audit = false - timeout_h = 1 + timeout_h = 0 include_tags = true locked = false require_full_window = false @@ -69,20 +69,21 @@ resource "datadog_monitor" "eventhub_failed_requests" { } resource "datadog_monitor" "eventhub_errors" { - name = "[${var.environment}] Event Hub too manny errors {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" + name = "[${var.environment}] Event Hub too many errors {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)" message = "${coalesce(var.errors_rate_message, var.message)}" query = < ${var.errors_rate_thresold_critical} + default( + ( + 
avg:azure.eventhub_namespaces.internal_server_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + + avg:azure.eventhub_namespaces.server_busy_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + + avg:azure.eventhub_namespaces.other_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + ) / ( + avg:azure.eventhub_namespaces.incoming_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() + ), + 0) * 100 + ) > ${var.errors_rate_thresold_critical} EOF type = "metric alert" @@ -98,7 +99,7 @@ resource "datadog_monitor" "eventhub_errors" { evaluation_delay = "${var.delay}" renotify_interval = 0 notify_audit = false - timeout_h = 1 + timeout_h = 0 include_tags = true locked = false require_full_window = false From 18453462a8fc25f7a7c98bbea6f9f5ee7b64d754 Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Fri, 16 Mar 2018 16:27:38 +0100 Subject: [PATCH 2/2] MON-137 Use min as aggregate for Azure app services response time monitor --- cloud/azure/app-services/monitors-app_services.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/app-services/monitors-app_services.tf b/cloud/azure/app-services/monitors-app_services.tf index c501bdf..2bfc36d 100644 --- a/cloud/azure/app-services/monitors-app_services.tf +++ b/cloud/azure/app-services/monitors-app_services.tf @@ -13,7 +13,7 @@ resource "datadog_monitor" "appservices_response_time" { message = "${coalesce(var.response_time_message, var.message)}" query = < ${var.response_time_threshold_critical} EOF