From 5cbf1bbdab01b43722e495484ca26360efe36e44 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 14 Jun 2019 15:00:06 +0200 Subject: [PATCH 1/4] MON-476 reduce no data alerts per set of monitors --- caas/kubernetes/node/monitors-k8s-node.tf | 4 +-- .../elasticsearch/monitors-elasticsearch.tf | 6 ++--- cloud/aws/rds/common/monitors-rds-common.tf | 4 +-- .../azure-search/monitors-azure-search.tf | 2 +- cloud/azure/iothubs/monitors-iothubs.tf | 26 +++++++++---------- .../common/monitors-cloud-sql-common.tf | 6 ++--- cloud/gcp/pubsub/monitors-pubsub.tf | 2 +- 7 files changed, 25 insertions(+), 25 deletions(-) diff --git a/caas/kubernetes/node/monitors-k8s-node.tf b/caas/kubernetes/node/monitors-k8s-node.tf index 997ee5d..f814bec 100644 --- a/caas/kubernetes/node/monitors-k8s-node.tf +++ b/caas/kubernetes/node/monitors-k8s-node.tf @@ -253,7 +253,7 @@ resource "datadog_monitor" "volume_space" { evaluation_delay = "${var.evaluation_delay}" new_host_delay = "${var.new_host_delay}" - notify_no_data = true + notify_no_data = false renotify_interval = 0 notify_audit = false timeout_h = 0 @@ -286,7 +286,7 @@ resource "datadog_monitor" "volume_inodes" { evaluation_delay = "${var.evaluation_delay}" new_host_delay = "${var.new_host_delay}" - notify_no_data = true + notify_no_data = false renotify_interval = 0 notify_audit = false timeout_h = 0 diff --git a/cloud/aws/elasticsearch/monitors-elasticsearch.tf b/cloud/aws/elasticsearch/monitors-elasticsearch.tf index 04f8f6f..ec19a29 100644 --- a/cloud/aws/elasticsearch/monitors-elasticsearch.tf +++ b/cloud/aws/elasticsearch/monitors-elasticsearch.tf @@ -22,7 +22,7 @@ resource "datadog_monitor" "es_cluster_status" { critical = 2 } - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false @@ -57,7 +57,7 @@ resource "datadog_monitor" "es_free_space_low" { critical = "${var.diskspace_threshold_critical}" } - notify_no_data = true + notify_no_data = 
false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false @@ -91,7 +91,7 @@ resource "datadog_monitor" "es_cpu_90_15min" { critical = "${var.cpu_threshold_critical}" } - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false diff --git a/cloud/aws/rds/common/monitors-rds-common.tf b/cloud/aws/rds/common/monitors-rds-common.tf index ba001a7..c82c775 100644 --- a/cloud/aws/rds/common/monitors-rds-common.tf +++ b/cloud/aws/rds/common/monitors-rds-common.tf @@ -17,7 +17,7 @@ resource "datadog_monitor" "rds_cpu_90_15min" { critical = "${var.cpu_threshold_critical}" } - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" notify_audit = false timeout_h = 0 @@ -84,7 +84,7 @@ resource "datadog_monitor" "rds_replica_lag" { critical = "${var.replicalag_threshold_critical}" } - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" notify_audit = false timeout_h = 0 diff --git a/cloud/azure/azure-search/monitors-azure-search.tf b/cloud/azure/azure-search/monitors-azure-search.tf index 3228d3a..b69605a 100644 --- a/cloud/azure/azure-search/monitors-azure-search.tf +++ b/cloud/azure/azure-search/monitors-azure-search.tf @@ -53,7 +53,7 @@ resource "datadog_monitor" "azure_search_throttled_queries_rate" { silenced = "${var.throttled_queries_rate_silenced}" - notify_no_data = true # Will notify when no data is received + notify_no_data = false # Will notify when no data is received renotify_interval = 0 require_full_window = false timeout_h = 0 diff --git a/cloud/azure/iothubs/monitors-iothubs.tf b/cloud/azure/iothubs/monitors-iothubs.tf index 823611e..78efcbf 100644 --- a/cloud/azure/iothubs/monitors-iothubs.tf +++ b/cloud/azure/iothubs/monitors-iothubs.tf @@ -21,7 +21,7 @@ resource "datadog_monitor" "too_many_jobs_failed" { silenced = "${var.failed_jobs_rate_silenced}" - notify_no_data = 
true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false @@ -57,7 +57,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" { silenced = "${var.failed_listjobs_rate_silenced}" - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false @@ -93,7 +93,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" { silenced = "${var.failed_queryjobs_rate_silenced}" - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false @@ -149,7 +149,7 @@ resource "datadog_monitor" "total_devices" { silenced = "${var.total_devices_silenced}" - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false @@ -185,7 +185,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" { silenced = "${var.failed_c2d_methods_rate_silenced}" - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false @@ -221,7 +221,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" { silenced = "${var.failed_c2d_twin_read_rate_silenced}" - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false @@ -257,7 +257,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" { silenced = "${var.failed_c2d_twin_update_rate_silenced}" - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false @@ -293,7 +293,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" { silenced = "${var.failed_d2c_twin_read_rate_silenced}" - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false @@ -329,7 +329,7 
@@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" { silenced = "${var.failed_d2c_twin_update_rate_silenced}" - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false @@ -367,7 +367,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" { silenced = "${var.dropped_d2c_telemetry_egress_silenced}" - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false @@ -405,7 +405,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" { silenced = "${var.orphaned_d2c_telemetry_egress_silenced}" - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false @@ -443,7 +443,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" { silenced = "${var.invalid_d2c_telemetry_egress_silenced}" - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false @@ -473,7 +473,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" { silenced = "${var.too_many_d2c_telemetry_ingress_nosent_silenced}" - notify_no_data = true + notify_no_data = false evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false diff --git a/cloud/gcp/cloud-sql/common/monitors-cloud-sql-common.tf b/cloud/gcp/cloud-sql/common/monitors-cloud-sql-common.tf index 7b2755b..83290f5 100644 --- a/cloud/gcp/cloud-sql/common/monitors-cloud-sql-common.tf +++ b/cloud/gcp/cloud-sql/common/monitors-cloud-sql-common.tf @@ -25,7 +25,7 @@ resource "datadog_monitor" "cpu_utilization" { timeout_h = 0 include_tags = true require_full_window = false - notify_no_data = true + notify_no_data = false renotify_interval = 0 evaluation_delay = "${var.evaluation_delay}" @@ -145,7 +145,7 @@ resource "datadog_monitor" "memory_utilization" 
{ timeout_h = 0 include_tags = true require_full_window = false - notify_no_data = true + notify_no_data = false renotify_interval = 0 evaluation_delay = "${var.evaluation_delay}" @@ -226,7 +226,7 @@ resource "datadog_monitor" "failover_unavailable" { timeout_h = 0 include_tags = true require_full_window = false - notify_no_data = true + notify_no_data = false renotify_interval = 0 evaluation_delay = "${var.evaluation_delay}" diff --git a/cloud/gcp/pubsub/monitors-pubsub.tf b/cloud/gcp/pubsub/monitors-pubsub.tf index d9e472b..4fd7a5b 100644 --- a/cloud/gcp/pubsub/monitors-pubsub.tf +++ b/cloud/gcp/pubsub/monitors-pubsub.tf @@ -60,7 +60,7 @@ resource "datadog_monitor" "unavailable_sending_operations_count" { timeout_h = 0 include_tags = true require_full_window = false - notify_no_data = true + notify_no_data = false renotify_interval = 0 evaluation_delay = "${var.evaluation_delay}" From da324c10b3aad37c75e2ae920adb8bd34ee9968b Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 14 Jun 2019 15:12:34 +0200 Subject: [PATCH 2/4] MON-476 fmt and add one no data per set as best practice in ci --- cloud/azure/azure-search/monitors-azure-search.tf | 2 +- scripts/90_best_practices.sh | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) create mode 100755 scripts/90_best_practices.sh diff --git a/cloud/azure/azure-search/monitors-azure-search.tf b/cloud/azure/azure-search/monitors-azure-search.tf index b69605a..0dea51a 100644 --- a/cloud/azure/azure-search/monitors-azure-search.tf +++ b/cloud/azure/azure-search/monitors-azure-search.tf @@ -53,7 +53,7 @@ resource "datadog_monitor" "azure_search_throttled_queries_rate" { silenced = "${var.throttled_queries_rate_silenced}" - notify_no_data = false # Will notify when no data is received + notify_no_data = false # Will notify when no data is received renotify_interval = 0 require_full_window = false timeout_h = 0 diff --git a/scripts/90_best_practices.sh b/scripts/90_best_practices.sh new file mode 100755 
index 0000000..12be77b
--- /dev/null
+++ b/scripts/90_best_practices.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -xueo pipefail
+# CI best-practice check: fail if a monitors set has notify_no_data set to true more than once
+source "$(dirname "$0")/utils.sh"
+goto_root
+
+# loop over every monitors set; read find's output line by line so paths are never word-split or globbed
+while IFS= read -r path; do
+  # check if there is more than 1 notify_no_data parameter set to true per set of monitors
+  if [[ $(grep -c 'notify_no_data.*true' "$path") -gt 1 ]]; then
+    echo "More than one notify_no_data set to true on $path"
+    exit 1
+  fi
+done < <(find "$(get_scope "$1")" -path ./incubator -prune -o -name 'monitors-*.tf' -print | sort -fdbi)
From a2aecaf3f2d5e4bdb1836211b92287e5b879aae3 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 14 Jun 2019 17:44:31 +0200 Subject: [PATCH 3/4] MON-476 keep one no notify no data for elasticsearch --- cloud/aws/elasticsearch/monitors-elasticsearch.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/aws/elasticsearch/monitors-elasticsearch.tf b/cloud/aws/elasticsearch/monitors-elasticsearch.tf index ec19a29..6ff5d78 100644 --- a/cloud/aws/elasticsearch/monitors-elasticsearch.tf +++ b/cloud/aws/elasticsearch/monitors-elasticsearch.tf @@ -22,7 +22,7 @@ resource "datadog_monitor" "es_cluster_status" { critical = 2 } - notify_no_data = false + notify_no_data = true evaluation_delay = "${var.evaluation_delay}" renotify_interval = 0 notify_audit = false From 94c20d32c1fea073360ee8021254ce5245a6ceae Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 14 Jun 2019 17:45:54 +0200 Subject: [PATCH 4/4] MON-476 fix comment for notify no data --- cloud/azure/azure-search/monitors-azure-search.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/azure/azure-search/monitors-azure-search.tf b/cloud/azure/azure-search/monitors-azure-search.tf index 0dea51a..c280d3b 100644 --- a/cloud/azure/azure-search/monitors-azure-search.tf +++ b/cloud/azure/azure-search/monitors-azure-search.tf @@ -21,7 +21,7 @@ resource "datadog_monitor" "azure_search_latency" { silenced = "${var.latency_silenced}" -
notify_no_data = true # Will notify when no data is received + notify_no_data = true # Notifies when no data is received renotify_interval = 0 require_full_window = false timeout_h = 0