From dd09e1aedc53c1c51c2e23ac05eef8906c90b43a Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 13 Sep 2019 09:55:40 +0200 Subject: [PATCH 1/3] MON-512 fix queries for elasticsearch latencies monitors --- .../elasticsearch/monitors-elasticsearch.tf | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/database/elasticsearch/monitors-elasticsearch.tf b/database/elasticsearch/monitors-elasticsearch.tf index 740f20d..6f512ed 100644 --- a/database/elasticsearch/monitors-elasticsearch.tf +++ b/database/elasticsearch/monitors-elasticsearch.tf @@ -329,7 +329,10 @@ resource "datadog_monitor" "jvm_gc_old_collection_latency" { query = < ${var.jvm_gc_old_collection_latency_threshold_critical} EOQ @@ -364,7 +367,10 @@ resource "datadog_monitor" "jvm_gc_young_collection_latency" { query = < ${var.jvm_gc_young_collection_latency_threshold_critical} EOQ @@ -400,7 +406,10 @@ resource "datadog_monitor" "indexing_latency" { // TODO add tags to filter by node type and do not apply this monitor on non-data nodes query = < ${var.indexing_latency_threshold_critical} EOQ @@ -436,7 +445,10 @@ resource "datadog_monitor" "flush_latency" { // TODO add tags to filter by node type and do not apply this monitor on non-data nodes query = < ${var.flush_latency_threshold_critical} EOQ @@ -520,7 +532,10 @@ resource "datadog_monitor" "search_query_latency" { // TODO add tags to filter by node type and do not apply this monitor on non-data nodes query = < ${var.search_query_latency_threshold_critical} EOQ @@ -556,7 +571,10 @@ resource "datadog_monitor" "fetch_latency" { // TODO add tags to filter by node type and do not apply this monitor on non-data nodes query = < ${var.fetch_latency_threshold_critical} EOQ From 6b628ccdf6803e22d3e180561b621b6c9474f079 Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 13 Sep 2019 11:16:04 +0200 Subject: [PATCH 2/3] MON-512 adapt thresholds and timeframes --- database/elasticsearch/inputs.tf | 32 +++++++++---------- .../elasticsearch/monitors-elasticsearch.tf | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/database/elasticsearch/inputs.tf b/database/elasticsearch/inputs.tf index 11e6dfe..b06cc3f 100644 --- a/database/elasticsearch/inputs.tf +++ b/database/elasticsearch/inputs.tf @@ -418,19 +418,19 @@ variable "jvm_gc_old_collection_latency_time_aggregator" { variable "jvm_gc_old_collection_latency_timeframe" { description = "Timeframe for the Cluster Status monitor" type = string - default = "last_10m" + default = "last_15m" } variable "jvm_gc_old_collection_latency_threshold_warning" { description = "Cluster Status warning threshold" type = string - default = 160 + default = 200 } variable "jvm_gc_old_collection_latency_threshold_critical" { description = "Cluster Status critical threshold" type = string - default = 200 + default = 300 } variable "jvm_gc_old_collection_latency_enabled" { @@ -463,7 +463,7 @@ variable "jvm_gc_young_collection_latency_time_aggregator" { variable "jvm_gc_young_collection_latency_timeframe" { description = "Timeframe for the Cluster Status monitor" type = string - default = "last_10m" + default = "last_15m" } variable "jvm_gc_young_collection_latency_threshold_warning" { @@ -475,7 +475,7 @@ variable "jvm_gc_young_collection_latency_threshold_warning" { variable "jvm_gc_young_collection_latency_threshold_critical" { description = "Cluster Status critical threshold" type = string - default = 25 + default = 40 } variable "jvm_gc_young_collection_latency_enabled" { @@ -514,13 +514,13 @@ variable "indexing_latency_timeframe" { variable "indexing_latency_threshold_warning" { description = "Cluster Status warning threshold" type = string - default = 10 + default = 15 } variable "indexing_latency_threshold_critical" { description = "Cluster Status critical threshold" type = string - default = 15 + default = 30 } variable "indexing_latency_enabled" { @@ -553,19 +553,19 @@ variable "flush_latency_time_aggregator" { variable "flush_latency_timeframe" { description = "Timeframe for the Cluster Status monitor" type = string - default = "last_10m" + default = "last_15m" } variable "flush_latency_threshold_warning" { description = "Cluster Status warning threshold" type = string - default = 50 + default = 100 } variable "flush_latency_threshold_critical" { description = "Cluster Status critical threshold" type = string - default = 100 + default = 150 } variable "flush_latency_enabled" { @@ -685,19 +685,19 @@ variable "search_query_latency_time_aggregator" { variable "search_query_latency_timeframe" { description = "Timeframe for the Cluster Status monitor" type = string - default = "last_10m" + default = "last_15m" } variable "search_query_latency_threshold_warning" { description = "Cluster Status warning threshold" type = string - default = 0.5 + default = 10 } variable "search_query_latency_threshold_critical" { description = "Cluster Status critical threshold" type = string - default = 1 + default = 20 } variable "search_query_latency_enabled" { @@ -730,19 +730,19 @@ variable "fetch_latency_time_aggregator" { variable "fetch_latency_timeframe" { description = "Timeframe for the Cluster Status monitor" type = string - default = "last_10m" + default = "last_15m" } variable "fetch_latency_threshold_warning" { description = "Cluster Status warning threshold" type = string - default = 2 + default = 10 } variable "fetch_latency_threshold_critical" { description = "Cluster Status critical threshold" type = string - default = 4 + default = 20 } variable "fetch_latency_enabled" { diff --git a/database/elasticsearch/monitors-elasticsearch.tf b/database/elasticsearch/monitors-elasticsearch.tf index 6f512ed..242f0e2 100644 --- a/database/elasticsearch/monitors-elasticsearch.tf +++ b/database/elasticsearch/monitors-elasticsearch.tf @@ -399,7 +399,7 @@ EOQ # resource "datadog_monitor" "indexing_latency" { count = var.indexing_latency_enabled == "true" ? 1 : 0 - name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average indexing time by document {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average indexing latency by document {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" message = coalesce(var.indexing_latency_message, var.message) type = "query alert" From 45dfde159a9f85fd7b7686f89e5bd6a1705b685c Mon Sep 17 00:00:00 2001 From: Quentin Manfroi Date: Fri, 13 Sep 2019 11:24:17 +0200 Subject: [PATCH 3/3] MON-512 auto update --- database/elasticsearch/README.md | 34 ++++++++++++++++---------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/database/elasticsearch/README.md b/database/elasticsearch/README.md index 14115b2..72307f6 100644 --- a/database/elasticsearch/README.md +++ b/database/elasticsearch/README.md @@ -17,7 +17,7 @@ module "datadog-monitors-database-elasticsearch" { Creates DataDog monitors with the following checks: - Elasticsearch average index flushing to disk latency -- Elasticsearch average indexing time by document +- Elasticsearch average indexing latency by document - Elasticsearch average Old-generation garbage collections latency - Elasticsearch average search fetch latency - Elasticsearch average search query latency @@ -84,10 +84,10 @@ Creates DataDog monitors with the following checks: | fetch\_latency\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no | | fetch\_latency\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no | | fetch\_latency\_message | Custom message for the Cluster Status monitor | string | `""` | no | -| fetch\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"4"` | no | -| fetch\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"2"` | no | +| fetch\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"20"` | no | +| fetch\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"10"` | no | | fetch\_latency\_time\_aggregator | Time aggregator for the Cluster Status monitor | string | `"min"` | no | -| fetch\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_10m"` | no | +| fetch\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_15m"` | no | | field\_data\_evictions\_change\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no | | field\_data\_evictions\_change\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no | | field\_data\_evictions\_change\_message | Custom message for the Cluster Status monitor | string | `""` | no | @@ -102,10 +102,10 @@ Creates DataDog monitors with the following checks: | flush\_latency\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no | | flush\_latency\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no | | flush\_latency\_message | Custom message for the Cluster Status monitor | string | `""` | no | -| flush\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"100"` | no | -| flush\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"50"` | no | +| flush\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"150"` | no | +| flush\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"100"` | no | | flush\_latency\_time\_aggregator | Time aggregator for the Cluster Status monitor | string | `"avg"` | no | -| flush\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_10m"` | no | +| flush\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_15m"` | no | | http\_connections\_anomaly\_alert\_window | Alert window. | string | `"last_15m"` | no | | http\_connections\_anomaly\_count\_default\_zero | Count default zero. | string | `"true"` | no | | http\_connections\_anomaly\_detection\_algorithm | Anomaly Detection Algorithm used | string | `"agile"` | no | @@ -123,24 +123,24 @@ Creates DataDog monitors with the following checks: | indexing\_latency\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no | | indexing\_latency\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no | | indexing\_latency\_message | Custom message for the Cluster Status monitor | string | `""` | no | -| indexing\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"15"` | no | -| indexing\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"10"` | no | +| indexing\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"30"` | no | +| indexing\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"15"` | no | | indexing\_latency\_time\_aggregator | Time aggregator for the Cluster Status monitor | string | `"avg"` | no | | indexing\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_10m"` | no | | jvm\_gc\_old\_collection\_latency\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no | | jvm\_gc\_old\_collection\_latency\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no | | jvm\_gc\_old\_collection\_latency\_message | Custom message for the Cluster Status monitor | string | `""` | no | -| jvm\_gc\_old\_collection\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"200"` | no | -| jvm\_gc\_old\_collection\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"160"` | no | +| jvm\_gc\_old\_collection\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"300"` | no | +| jvm\_gc\_old\_collection\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"200"` | no | | jvm\_gc\_old\_collection\_latency\_time\_aggregator | Time aggregator for the Cluster Status monitor | string | `"avg"` | no | -| jvm\_gc\_old\_collection\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_10m"` | no | +| jvm\_gc\_old\_collection\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_15m"` | no | | jvm\_gc\_young\_collection\_latency\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no | | jvm\_gc\_young\_collection\_latency\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no | | jvm\_gc\_young\_collection\_latency\_message | Custom message for the Cluster Status monitor | string | `""` | no | -| jvm\_gc\_young\_collection\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"25"` | no | +| jvm\_gc\_young\_collection\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"40"` | no | | jvm\_gc\_young\_collection\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"20"` | no | | jvm\_gc\_young\_collection\_latency\_time\_aggregator | Time aggregator for the Cluster Status monitor | string | `"avg"` | no | -| jvm\_gc\_young\_collection\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_10m"` | no | +| jvm\_gc\_young\_collection\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_15m"` | no | | jvm\_heap\_memory\_usage\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no | | jvm\_heap\_memory\_usage\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no | | jvm\_heap\_memory\_usage\_message | Custom message for the Cluster Status monitor | string | `""` | no | @@ -204,10 +204,10 @@ Creates DataDog monitors with the following checks: | search\_query\_latency\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no | | search\_query\_latency\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no | | search\_query\_latency\_message | Custom message for the Cluster Status monitor | string | `""` | no | -| search\_query\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"1"` | no | -| search\_query\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"0.5"` | no | +| search\_query\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"20"` | no | +| search\_query\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"10"` | no | | search\_query\_latency\_time\_aggregator | Time aggregator for the Cluster Status monitor | string | `"avg"` | no | -| search\_query\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_10m"` | no | +| search\_query\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_15m"` | no | | task\_time\_in\_queue\_change\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no | | task\_time\_in\_queue\_change\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no | | task\_time\_in\_queue\_change\_message | Custom message for the Cluster Status monitor | string | `""` | no |