Merge branch 'MON-512_fix_elasticsearch_latencies' into 'master'
Resolve MON-512 "Fix elasticsearch latencies". Closes MON-512. See merge request claranet/pt-monitoring/projects/datadog/terraform/monitors!123
This commit is contained in:
commit
800fe16141
@ -17,7 +17,7 @@ module "datadog-monitors-database-elasticsearch" {
|
||||
Creates DataDog monitors with the following checks:
|
||||
|
||||
- Elasticsearch average index flushing to disk latency
|
||||
- Elasticsearch average indexing time by document
|
||||
- Elasticsearch average indexing latency by document
|
||||
- Elasticsearch average Old-generation garbage collections latency
|
||||
- Elasticsearch average search fetch latency
|
||||
- Elasticsearch average search query latency
|
||||
@ -84,10 +84,10 @@ Creates DataDog monitors with the following checks:
|
||||
| fetch\_latency\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no |
|
||||
| fetch\_latency\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no |
|
||||
| fetch\_latency\_message | Custom message for the Cluster Status monitor | string | `""` | no |
|
||||
| fetch\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"4"` | no |
|
||||
| fetch\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"2"` | no |
|
||||
| fetch\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"20"` | no |
|
||||
| fetch\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"10"` | no |
|
||||
| fetch\_latency\_time\_aggregator | Time aggregator for the Cluster Status monitor | string | `"min"` | no |
|
||||
| fetch\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_10m"` | no |
|
||||
| fetch\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_15m"` | no |
|
||||
| field\_data\_evictions\_change\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no |
|
||||
| field\_data\_evictions\_change\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no |
|
||||
| field\_data\_evictions\_change\_message | Custom message for the Cluster Status monitor | string | `""` | no |
|
||||
@ -102,10 +102,10 @@ Creates DataDog monitors with the following checks:
|
||||
| flush\_latency\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no |
|
||||
| flush\_latency\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no |
|
||||
| flush\_latency\_message | Custom message for the Cluster Status monitor | string | `""` | no |
|
||||
| flush\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"100"` | no |
|
||||
| flush\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"50"` | no |
|
||||
| flush\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"150"` | no |
|
||||
| flush\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"100"` | no |
|
||||
| flush\_latency\_time\_aggregator | Time aggregator for the Cluster Status monitor | string | `"avg"` | no |
|
||||
| flush\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_10m"` | no |
|
||||
| flush\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_15m"` | no |
|
||||
| http\_connections\_anomaly\_alert\_window | Alert window. | string | `"last_15m"` | no |
|
||||
| http\_connections\_anomaly\_count\_default\_zero | Count default zero. | string | `"true"` | no |
|
||||
| http\_connections\_anomaly\_detection\_algorithm | Anomaly Detection Algorithm used | string | `"agile"` | no |
|
||||
@ -123,24 +123,24 @@ Creates DataDog monitors with the following checks:
|
||||
| indexing\_latency\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no |
|
||||
| indexing\_latency\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no |
|
||||
| indexing\_latency\_message | Custom message for the Cluster Status monitor | string | `""` | no |
|
||||
| indexing\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"15"` | no |
|
||||
| indexing\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"10"` | no |
|
||||
| indexing\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"30"` | no |
|
||||
| indexing\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"15"` | no |
|
||||
| indexing\_latency\_time\_aggregator | Time aggregator for the Cluster Status monitor | string | `"avg"` | no |
|
||||
| indexing\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_10m"` | no |
|
||||
| jvm\_gc\_old\_collection\_latency\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no |
|
||||
| jvm\_gc\_old\_collection\_latency\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no |
|
||||
| jvm\_gc\_old\_collection\_latency\_message | Custom message for the Cluster Status monitor | string | `""` | no |
|
||||
| jvm\_gc\_old\_collection\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"200"` | no |
|
||||
| jvm\_gc\_old\_collection\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"160"` | no |
|
||||
| jvm\_gc\_old\_collection\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"300"` | no |
|
||||
| jvm\_gc\_old\_collection\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"200"` | no |
|
||||
| jvm\_gc\_old\_collection\_latency\_time\_aggregator | Time aggregator for the Cluster Status monitor | string | `"avg"` | no |
|
||||
| jvm\_gc\_old\_collection\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_10m"` | no |
|
||||
| jvm\_gc\_old\_collection\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_15m"` | no |
|
||||
| jvm\_gc\_young\_collection\_latency\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no |
|
||||
| jvm\_gc\_young\_collection\_latency\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no |
|
||||
| jvm\_gc\_young\_collection\_latency\_message | Custom message for the Cluster Status monitor | string | `""` | no |
|
||||
| jvm\_gc\_young\_collection\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"25"` | no |
|
||||
| jvm\_gc\_young\_collection\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"40"` | no |
|
||||
| jvm\_gc\_young\_collection\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"20"` | no |
|
||||
| jvm\_gc\_young\_collection\_latency\_time\_aggregator | Time aggregator for the Cluster Status monitor | string | `"avg"` | no |
|
||||
| jvm\_gc\_young\_collection\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_10m"` | no |
|
||||
| jvm\_gc\_young\_collection\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_15m"` | no |
|
||||
| jvm\_heap\_memory\_usage\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no |
|
||||
| jvm\_heap\_memory\_usage\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no |
|
||||
| jvm\_heap\_memory\_usage\_message | Custom message for the Cluster Status monitor | string | `""` | no |
|
||||
@ -204,10 +204,10 @@ Creates DataDog monitors with the following checks:
|
||||
| search\_query\_latency\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no |
|
||||
| search\_query\_latency\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no |
|
||||
| search\_query\_latency\_message | Custom message for the Cluster Status monitor | string | `""` | no |
|
||||
| search\_query\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"1"` | no |
|
||||
| search\_query\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"0.5"` | no |
|
||||
| search\_query\_latency\_threshold\_critical | Cluster Status critical threshold | string | `"20"` | no |
|
||||
| search\_query\_latency\_threshold\_warning | Cluster Status warning threshold | string | `"10"` | no |
|
||||
| search\_query\_latency\_time\_aggregator | Time aggregator for the Cluster Status monitor | string | `"avg"` | no |
|
||||
| search\_query\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_10m"` | no |
|
||||
| search\_query\_latency\_timeframe | Timeframe for the Cluster Status monitor | string | `"last_15m"` | no |
|
||||
| task\_time\_in\_queue\_change\_enabled | Flag to enable Cluster Status monitor | string | `"true"` | no |
|
||||
| task\_time\_in\_queue\_change\_extra\_tags | Extra tags for Cluster Status monitor | list(string) | `[]` | no |
|
||||
| task\_time\_in\_queue\_change\_message | Custom message for the Cluster Status monitor | string | `""` | no |
|
||||
|
||||
@ -418,19 +418,19 @@ variable "jvm_gc_old_collection_latency_time_aggregator" {
|
||||
variable "jvm_gc_old_collection_latency_timeframe" {
|
||||
description = "Timeframe for the Cluster Status monitor"
|
||||
type = string
|
||||
default = "last_10m"
|
||||
default = "last_15m"
|
||||
}
|
||||
|
||||
variable "jvm_gc_old_collection_latency_threshold_warning" {
|
||||
description = "Cluster Status warning threshold"
|
||||
type = string
|
||||
default = 160
|
||||
default = 200
|
||||
}
|
||||
|
||||
variable "jvm_gc_old_collection_latency_threshold_critical" {
|
||||
description = "Cluster Status critical threshold"
|
||||
type = string
|
||||
default = 200
|
||||
default = 300
|
||||
}
|
||||
|
||||
variable "jvm_gc_old_collection_latency_enabled" {
|
||||
@ -463,7 +463,7 @@ variable "jvm_gc_young_collection_latency_time_aggregator" {
|
||||
variable "jvm_gc_young_collection_latency_timeframe" {
|
||||
description = "Timeframe for the Cluster Status monitor"
|
||||
type = string
|
||||
default = "last_10m"
|
||||
default = "last_15m"
|
||||
}
|
||||
|
||||
variable "jvm_gc_young_collection_latency_threshold_warning" {
|
||||
@ -475,7 +475,7 @@ variable "jvm_gc_young_collection_latency_threshold_warning" {
|
||||
variable "jvm_gc_young_collection_latency_threshold_critical" {
|
||||
description = "Cluster Status critical threshold"
|
||||
type = string
|
||||
default = 25
|
||||
default = 40
|
||||
}
|
||||
|
||||
variable "jvm_gc_young_collection_latency_enabled" {
|
||||
@ -514,13 +514,13 @@ variable "indexing_latency_timeframe" {
|
||||
variable "indexing_latency_threshold_warning" {
|
||||
description = "Cluster Status warning threshold"
|
||||
type = string
|
||||
default = 10
|
||||
default = 15
|
||||
}
|
||||
|
||||
variable "indexing_latency_threshold_critical" {
|
||||
description = "Cluster Status critical threshold"
|
||||
type = string
|
||||
default = 15
|
||||
default = 30
|
||||
}
|
||||
|
||||
variable "indexing_latency_enabled" {
|
||||
@ -553,19 +553,19 @@ variable "flush_latency_time_aggregator" {
|
||||
variable "flush_latency_timeframe" {
|
||||
description = "Timeframe for the Cluster Status monitor"
|
||||
type = string
|
||||
default = "last_10m"
|
||||
default = "last_15m"
|
||||
}
|
||||
|
||||
variable "flush_latency_threshold_warning" {
|
||||
description = "Cluster Status warning threshold"
|
||||
type = string
|
||||
default = 50
|
||||
default = 100
|
||||
}
|
||||
|
||||
variable "flush_latency_threshold_critical" {
|
||||
description = "Cluster Status critical threshold"
|
||||
type = string
|
||||
default = 100
|
||||
default = 150
|
||||
}
|
||||
|
||||
variable "flush_latency_enabled" {
|
||||
@ -685,19 +685,19 @@ variable "search_query_latency_time_aggregator" {
|
||||
variable "search_query_latency_timeframe" {
|
||||
description = "Timeframe for the Cluster Status monitor"
|
||||
type = string
|
||||
default = "last_10m"
|
||||
default = "last_15m"
|
||||
}
|
||||
|
||||
variable "search_query_latency_threshold_warning" {
|
||||
description = "Cluster Status warning threshold"
|
||||
type = string
|
||||
default = 0.5
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "search_query_latency_threshold_critical" {
|
||||
description = "Cluster Status critical threshold"
|
||||
type = string
|
||||
default = 1
|
||||
default = 20
|
||||
}
|
||||
|
||||
variable "search_query_latency_enabled" {
|
||||
@ -730,19 +730,19 @@ variable "fetch_latency_time_aggregator" {
|
||||
variable "fetch_latency_timeframe" {
|
||||
description = "Timeframe for the Cluster Status monitor"
|
||||
type = string
|
||||
default = "last_10m"
|
||||
default = "last_15m"
|
||||
}
|
||||
|
||||
variable "fetch_latency_threshold_warning" {
|
||||
description = "Cluster Status warning threshold"
|
||||
type = string
|
||||
default = 2
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "fetch_latency_threshold_critical" {
|
||||
description = "Cluster Status critical threshold"
|
||||
type = string
|
||||
default = 4
|
||||
default = 20
|
||||
}
|
||||
|
||||
variable "fetch_latency_enabled" {
|
||||
|
||||
@ -329,7 +329,10 @@ resource "datadog_monitor" "jvm_gc_old_collection_latency" {
|
||||
|
||||
query = <<EOQ
|
||||
${var.jvm_gc_old_collection_latency_time_aggregator}(${var.jvm_gc_old_collection_latency_timeframe}):
|
||||
avg:jvm.gc.collectors.old.collection_time${module.filter-tags.query_alert} by {node_name} / avg:jvm.gc.collectors.old.count${module.filter-tags.query_alert} by {node_name} * 1000
|
||||
default(
|
||||
diff(avg:jvm.gc.collectors.old.collection_time${module.filter-tags.query_alert} by {node_name}) /
|
||||
diff(avg:jvm.gc.collectors.old.count${module.filter-tags.query_alert} by {node_name})
|
||||
* 1000, 0)
|
||||
> ${var.jvm_gc_old_collection_latency_threshold_critical}
|
||||
EOQ
|
||||
|
||||
@ -364,7 +367,10 @@ resource "datadog_monitor" "jvm_gc_young_collection_latency" {
|
||||
|
||||
query = <<EOQ
|
||||
${var.jvm_gc_young_collection_latency_time_aggregator}(${var.jvm_gc_young_collection_latency_timeframe}):
|
||||
avg:jvm.gc.collectors.young.collection_time${module.filter-tags.query_alert} by {node_name} / avg:jvm.gc.collectors.young.count${module.filter-tags.query_alert} by {node_name} * 1000
|
||||
default(
|
||||
diff(avg:jvm.gc.collectors.young.collection_time${module.filter-tags.query_alert} by {node_name}) /
|
||||
diff(avg:jvm.gc.collectors.young.count${module.filter-tags.query_alert} by {node_name})
|
||||
* 1000, 0)
|
||||
> ${var.jvm_gc_young_collection_latency_threshold_critical}
|
||||
EOQ
|
||||
|
||||
@ -393,14 +399,17 @@ EOQ
|
||||
#
|
||||
resource "datadog_monitor" "indexing_latency" {
|
||||
count = var.indexing_latency_enabled == "true" ? 1 : 0
|
||||
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average indexing time by document {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
|
||||
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average indexing latency by document {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
|
||||
message = coalesce(var.indexing_latency_message, var.message)
|
||||
type = "query alert"
|
||||
|
||||
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
||||
query = <<EOQ
|
||||
${var.indexing_latency_time_aggregator}(${var.indexing_latency_timeframe}):
|
||||
avg:elasticsearch.indexing.index.time${module.filter-tags.query_alert} by {node_name}/ avg:elasticsearch.indexing.index.total${module.filter-tags.query_alert} by {node_name} * 1000
|
||||
default(
|
||||
diff(avg:elasticsearch.indexing.index.time${module.filter-tags.query_alert} by {node_name}) /
|
||||
diff(avg:elasticsearch.indexing.index.total${module.filter-tags.query_alert} by {node_name})
|
||||
* 1000, 0)
|
||||
> ${var.indexing_latency_threshold_critical}
|
||||
EOQ
|
||||
|
||||
@ -436,7 +445,10 @@ resource "datadog_monitor" "flush_latency" {
|
||||
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
||||
query = <<EOQ
|
||||
${var.flush_latency_time_aggregator}(${var.flush_latency_timeframe}):
|
||||
avg:elasticsearch.flush.total.time${module.filter-tags.query_alert} by {node_name} / avg:elasticsearch.flush.total${module.filter-tags.query_alert} by {node_name} * 1000
|
||||
default(
|
||||
diff(avg:elasticsearch.flush.total.time${module.filter-tags.query_alert} by {node_name}) /
|
||||
diff(avg:elasticsearch.flush.total${module.filter-tags.query_alert} by {node_name})
|
||||
* 1000, 0)
|
||||
> ${var.flush_latency_threshold_critical}
|
||||
EOQ
|
||||
|
||||
@ -520,7 +532,10 @@ resource "datadog_monitor" "search_query_latency" {
|
||||
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
||||
query = <<EOQ
|
||||
${var.search_query_latency_time_aggregator}(${var.search_query_latency_timeframe}):
|
||||
avg:elasticsearch.search.query.time${module.filter-tags.query_alert} by {node_name} / avg:elasticsearch.search.query.total${module.filter-tags.query_alert} by {node_name} * 1000
|
||||
default(
|
||||
diff(avg:elasticsearch.search.query.time${module.filter-tags.query_alert} by {node_name}) /
|
||||
diff(avg:elasticsearch.search.query.total${module.filter-tags.query_alert} by {node_name})
|
||||
* 1000, 0)
|
||||
> ${var.search_query_latency_threshold_critical}
|
||||
EOQ
|
||||
|
||||
@ -556,7 +571,10 @@ resource "datadog_monitor" "fetch_latency" {
|
||||
// TODO add tags to filter by node type and do not apply this monitor on non-data nodes
|
||||
query = <<EOQ
|
||||
${var.fetch_latency_time_aggregator}(${var.fetch_latency_timeframe}):
|
||||
avg:elasticsearch.search.fetch.time${module.filter-tags.query_alert} by {node_name} / avg:elasticsearch.search.fetch.total${module.filter-tags.query_alert} by {node_name} * 1000
|
||||
default(
|
||||
diff(avg:elasticsearch.search.fetch.time${module.filter-tags.query_alert} by {node_name}) /
|
||||
diff(avg:elasticsearch.search.fetch.total${module.filter-tags.query_alert} by {node_name})
|
||||
* 1000, 0)
|
||||
> ${var.fetch_latency_threshold_critical}
|
||||
EOQ
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user